/*
 * kmp_dispatch.cpp: dynamic scheduling - iteration initialization and dispatch.
 */

//===----------------------------------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

/* Dynamic scheduling initialization and dispatch.
 *
 * NOTE: __kmp_nth is a constant inside of any dispatch loop, however
 * it may change values between parallel regions. __kmp_max_nth
 * is the largest value __kmp_nth may take, 1 is the smallest.
 */

#include "kmp.h"
#include "kmp_error.h"
#include "kmp_i18n.h"
#include "kmp_itt.h"
#include "kmp_stats.h"
#include "kmp_str.h"
#if KMP_USE_X87CONTROL
#include <float.h>
#endif
#include "kmp_lock.h"
#include "kmp_dispatch.h"
#if KMP_USE_HIER_SCHED
#include "kmp_dispatch_hier.h"
#endif

#if OMPT_SUPPORT
#include "ompt-specific.h"
#endif

/* ------------------------------------------------------------------------ */
/* ------------------------------------------------------------------------ */

void __kmp_dispatch_deo_error(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
  kmp_info_t *th;

  KMP_DEBUG_ASSERT(gtid_ref);

  if (__kmp_env_consistency_check) {
    th = __kmp_threads[*gtid_ref];
    if (th->th.th_root->r.r_active &&
        (th->th.th_dispatch->th_dispatch_pr_current->pushed_ws != ct_none)) {
#if KMP_USE_DYNAMIC_LOCK
      __kmp_push_sync(*gtid_ref, ct_ordered_in_pdo, loc_ref, NULL, 0);
#else
      __kmp_push_sync(*gtid_ref, ct_ordered_in_pdo, loc_ref, NULL);
#endif
    }
  }
}

void __kmp_dispatch_dxo_error(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
  kmp_info_t *th;

  if (__kmp_env_consistency_check) {
    th = __kmp_threads[*gtid_ref];
    if (th->th.th_dispatch->th_dispatch_pr_current->pushed_ws != ct_none) {
      __kmp_pop_sync(*gtid_ref, ct_ordered_in_pdo, loc_ref);
    }
  }
}

// Returns either SCHEDULE_MONOTONIC or SCHEDULE_NONMONOTONIC
static inline int __kmp_get_monotonicity(ident_t *loc, enum sched_type schedule,
                                         bool use_hier = false) {
  // Pick up the nonmonotonic/monotonic bits from the scheduling type
  // Nonmonotonic as default for dynamic schedule when no modifier is specified
  int monotonicity = SCHEDULE_NONMONOTONIC;

  // Let default be monotonic for executables
  // compiled with OpenMP* 4.5 or less compilers
  if (loc != NULL && loc->get_openmp_version() < 50)
    monotonicity = SCHEDULE_MONOTONIC;

  if (use_hier || __kmp_force_monotonic)
    monotonicity = SCHEDULE_MONOTONIC;
  else if (SCHEDULE_HAS_NONMONOTONIC(schedule))
    monotonicity = SCHEDULE_NONMONOTONIC;
  else if (SCHEDULE_HAS_MONOTONIC(schedule))
    monotonicity = SCHEDULE_MONOTONIC;

  return monotonicity;
}
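// Illustrative note (not part of the runtime logic): assuming an ident_t whose
// get_openmp_version() reports 50 or newer, a plain schedule(dynamic) resolves
// to SCHEDULE_NONMONOTONIC here, while an explicit monotonic modifier,
// hierarchical scheduling, or __kmp_force_monotonic (set via the
// KMP_FORCE_MONOTONIC_DYNAMIC environment variable) forces SCHEDULE_MONOTONIC.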
#if KMP_WEIGHTED_ITERATIONS_SUPPORTED
// Return floating point number rounded to two decimal points
static inline float __kmp_round_2decimal_val(float num) {
  return (float)(static_cast<int>(num * 100 + 0.5)) / 100;
}
static inline int __kmp_get_round_val(float num) {
  return static_cast<int>(num < 0 ? num - 0.5 : num + 0.5);
}
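// Illustrative values (assuming exact float arithmetic for readability):
// __kmp_get_round_val(2.5f) == 3, __kmp_get_round_val(-2.5f) == -3, and
// __kmp_round_2decimal_val(18.1818f) == 18.18f. Both helpers implement
// round-half-away-from-zero, not the round-half-to-even of std::nearbyint.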
#endif

template <typename T>
inline void
__kmp_initialize_self_buffer(kmp_team_t *team, T id,
                             dispatch_private_info_template<T> *pr,
                             typename traits_t<T>::unsigned_t nchunks, T nproc,
                             typename traits_t<T>::unsigned_t &init,
                             T &small_chunk, T &extras, T &p_extra) {

#if KMP_WEIGHTED_ITERATIONS_SUPPORTED
  if (pr->flags.use_hybrid) {
    kmp_info_t *th = __kmp_threads[__kmp_gtid_from_tid((int)id, team)];
    kmp_hw_core_type_t type =
        (kmp_hw_core_type_t)th->th.th_topology_attrs.core_type;
    T pchunks = pr->u.p.pchunks;
    T echunks = nchunks - pchunks;
    T num_procs_with_pcore = pr->u.p.num_procs_with_pcore;
    T num_procs_with_ecore = nproc - num_procs_with_pcore;
    T first_thread_with_ecore = pr->u.p.first_thread_with_ecore;
    T big_chunk =
        pchunks / num_procs_with_pcore; // chunks per thread with p-core
    small_chunk =
        echunks / num_procs_with_ecore; // chunks per thread with e-core

    extras =
        (pchunks % num_procs_with_pcore) + (echunks % num_procs_with_ecore);

    p_extra = (big_chunk - small_chunk);

    if (type == KMP_HW_CORE_TYPE_CORE) {
      if (id < first_thread_with_ecore) {
        init = id * small_chunk + id * p_extra + (id < extras ? id : extras);
      } else {
        init = id * small_chunk + (id - num_procs_with_ecore) * p_extra +
               (id < extras ? id : extras);
      }
    } else {
      if (id == first_thread_with_ecore) {
        init = id * small_chunk + id * p_extra + (id < extras ? id : extras);
      } else {
        init = id * small_chunk + first_thread_with_ecore * p_extra +
               (id < extras ? id : extras);
      }
    }
    p_extra = (type == KMP_HW_CORE_TYPE_CORE) ? p_extra : 0;
    return;
  }
#endif

  small_chunk = nchunks / nproc; // chunks per thread
  extras = nchunks % nproc;
  p_extra = 0;
  init = id * small_chunk + (id < extras ? id : extras);
}

#if KMP_STATIC_STEAL_ENABLED
enum { // values for steal_flag (possible states of private per-loop buffer)
  UNUSED = 0,
  CLAIMED = 1, // owner thread started initialization
  READY = 2, // available for stealing
  THIEF = 3 // finished by owner, or claimed by thief
  // possible state changes:
  // 0 -> 1   owner only, sync
  // 0 -> 3   thief only, sync
  // 1 -> 2   owner only, async
  // 2 -> 3   owner only, async
  // 3 -> 2   owner only, async
  // 3 -> 0   last thread finishing the loop, async
};
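// Sketch of a typical lifetime (illustrative, assuming two threads): the owner
// CASes its buffer UNUSED -> CLAIMED, fills in the bounds, then stores READY
// so victims may steal from it. A thief that finds a buffer still UNUSED may
// CAS it UNUSED -> THIEF and take the whole range, in which case the owner
// later observes THIEF and starts with no iterations of its own. The last
// thread finishing the loop resets the flag to UNUSED for the next loop.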
#endif

// Initialize a dispatch_private_info_template<T> buffer for a particular
// type of schedule,chunk. The loop description is found in lb (lower bound),
// ub (upper bound), and st (stride). nproc is the number of threads relevant
// to the scheduling (often the number of threads in a team, but not always if
// hierarchical scheduling is used). tid is the id of the thread calling
// the function within the group of nproc threads. It will have a value
// between 0 and nproc - 1. This is often just the thread id within a team, but
// is not necessarily the case when using hierarchical scheduling.
// loc is the source file location of the corresponding loop
// gtid is the global thread id
template <typename T>
void __kmp_dispatch_init_algorithm(ident_t *loc, int gtid,
                                   dispatch_private_info_template<T> *pr,
                                   enum sched_type schedule, T lb, T ub,
                                   typename traits_t<T>::signed_t st,
#if USE_ITT_BUILD
                                   kmp_uint64 *cur_chunk,
#endif
                                   typename traits_t<T>::signed_t chunk,
                                   T nproc, T tid) {
  typedef typename traits_t<T>::unsigned_t UT;
  typedef typename traits_t<T>::floating_t DBL;

  int active;
  T tc;
  kmp_info_t *th;
  kmp_team_t *team;
  int monotonicity;
  bool use_hier;

#ifdef KMP_DEBUG
  typedef typename traits_t<T>::signed_t ST;
  {
    char *buff;
    // create format specifiers before the debug output
    buff = __kmp_str_format("__kmp_dispatch_init_algorithm: T#%%d called "
                            "pr:%%p lb:%%%s ub:%%%s st:%%%s "
                            "schedule:%%d chunk:%%%s nproc:%%%s tid:%%%s\n",
                            traits_t<T>::spec, traits_t<T>::spec,
                            traits_t<ST>::spec, traits_t<ST>::spec,
                            traits_t<T>::spec, traits_t<T>::spec);
    KD_TRACE(10, (buff, gtid, pr, lb, ub, st, schedule, chunk, nproc, tid));
    __kmp_str_free(&buff);
  }
#endif
  /* setup data */
  th = __kmp_threads[gtid];
  team = th->th.th_team;
  active = !team->t.t_serialized;

#if USE_ITT_BUILD
  int itt_need_metadata_reporting =
      __itt_metadata_add_ptr && __kmp_forkjoin_frames_mode == 3 &&
      KMP_MASTER_GTID(gtid) && th->th.th_teams_microtask == NULL &&
      team->t.t_active_level == 1;
#endif

#if KMP_USE_HIER_SCHED
  use_hier = pr->flags.use_hier;
#else
  use_hier = false;
#endif

  /* Pick up the nonmonotonic/monotonic bits from the scheduling type */
  monotonicity = __kmp_get_monotonicity(loc, schedule, use_hier);
  schedule = SCHEDULE_WITHOUT_MODIFIERS(schedule);

  /* Pick up the nomerge/ordered bits from the scheduling type */
  if ((schedule >= kmp_nm_lower) && (schedule < kmp_nm_upper)) {
    pr->flags.nomerge = TRUE;
    schedule =
        (enum sched_type)(((int)schedule) - (kmp_nm_lower - kmp_sch_lower));
  } else {
    pr->flags.nomerge = FALSE;
  }
  pr->type_size = traits_t<T>::type_size; // remember the size of variables
  if (kmp_ord_lower & schedule) {
    pr->flags.ordered = TRUE;
    schedule =
        (enum sched_type)(((int)schedule) - (kmp_ord_lower - kmp_sch_lower));
  } else {
    pr->flags.ordered = FALSE;
  }
  // Ordered overrides nonmonotonic
  if (pr->flags.ordered) {
    monotonicity = SCHEDULE_MONOTONIC;
  }

  if (schedule == kmp_sch_static) {
    schedule = __kmp_static;
  } else {
    if (schedule == kmp_sch_runtime) {
      // Use the scheduling specified by OMP_SCHEDULE (or __kmp_sch_default if
      // not specified)
      schedule = team->t.t_sched.r_sched_type;
      monotonicity = __kmp_get_monotonicity(loc, schedule, use_hier);
      schedule = SCHEDULE_WITHOUT_MODIFIERS(schedule);
      if (pr->flags.ordered) // correct monotonicity for ordered loop if needed
        monotonicity = SCHEDULE_MONOTONIC;
      // Detail the schedule if needed (global controls are differentiated
      // appropriately)
      if (schedule == kmp_sch_guided_chunked) {
        schedule = __kmp_guided;
      } else if (schedule == kmp_sch_static) {
        schedule = __kmp_static;
      }
      // Use the chunk size specified by OMP_SCHEDULE (or default if not
      // specified)
      chunk = team->t.t_sched.chunk;
#if USE_ITT_BUILD
      if (cur_chunk)
        *cur_chunk = chunk;
#endif
#ifdef KMP_DEBUG
      {
        char *buff;
        // create format specifiers before the debug output
        buff = __kmp_str_format("__kmp_dispatch_init_algorithm: T#%%d new: "
                                "schedule:%%d chunk:%%%s\n",
                                traits_t<ST>::spec);
        KD_TRACE(10, (buff, gtid, schedule, chunk));
        __kmp_str_free(&buff);
      }
#endif
    } else {
      if (schedule == kmp_sch_guided_chunked) {
        schedule = __kmp_guided;
      }
      if (chunk <= 0) {
        chunk = KMP_DEFAULT_CHUNK;
      }
    }

    if (schedule == kmp_sch_auto) {
      // mapping and differentiation: in the __kmp_do_serial_initialize()
      schedule = __kmp_auto;
#ifdef KMP_DEBUG
      {
        char *buff;
        // create format specifiers before the debug output
        buff = __kmp_str_format(
            "__kmp_dispatch_init_algorithm: kmp_sch_auto: T#%%d new: "
            "schedule:%%d chunk:%%%s\n",
            traits_t<ST>::spec);
        KD_TRACE(10, (buff, gtid, schedule, chunk));
        __kmp_str_free(&buff);
      }
#endif
    }
#if KMP_STATIC_STEAL_ENABLED
    // map nonmonotonic:dynamic to static steal
    if (schedule == kmp_sch_dynamic_chunked) {
      if (monotonicity == SCHEDULE_NONMONOTONIC)
        schedule = kmp_sch_static_steal;
    }
#endif
    /* guided analytical not safe for too many threads */
    if (schedule == kmp_sch_guided_analytical_chunked && nproc > 1 << 20) {
      schedule = kmp_sch_guided_iterative_chunked;
      KMP_WARNING(DispatchManyThreads);
    }
    if (schedule == kmp_sch_runtime_simd) {
      // compiler provides simd_width in the chunk parameter
      schedule = team->t.t_sched.r_sched_type;
      monotonicity = __kmp_get_monotonicity(loc, schedule, use_hier);
      schedule = SCHEDULE_WITHOUT_MODIFIERS(schedule);
      // Detail the schedule if needed (global controls are differentiated
      // appropriately)
      if (schedule == kmp_sch_static || schedule == kmp_sch_auto ||
          schedule == __kmp_static) {
        schedule = kmp_sch_static_balanced_chunked;
      } else {
        if (schedule == kmp_sch_guided_chunked || schedule == __kmp_guided) {
          schedule = kmp_sch_guided_simd;
        }
        chunk = team->t.t_sched.chunk * chunk;
      }
#if USE_ITT_BUILD
      if (cur_chunk)
        *cur_chunk = chunk;
#endif
#ifdef KMP_DEBUG
      {
        char *buff;
        // create format specifiers before the debug output
        buff = __kmp_str_format(
            "__kmp_dispatch_init_algorithm: T#%%d new: schedule:%%d"
            " chunk:%%%s\n",
            traits_t<ST>::spec);
        KD_TRACE(10, (buff, gtid, schedule, chunk));
        __kmp_str_free(&buff);
      }
#endif
    }
    pr->u.p.parm1 = chunk;
  }
  KMP_ASSERT2((kmp_sch_lower < schedule && schedule < kmp_sch_upper),
              "unknown scheduling type");

  pr->u.p.count = 0;

  if (__kmp_env_consistency_check) {
    if (st == 0) {
      __kmp_error_construct(kmp_i18n_msg_CnsLoopIncrZeroProhibited,
                            (pr->flags.ordered ? ct_pdo_ordered : ct_pdo), loc);
    }
  }
  // compute trip count
  if (st == 1) { // most common case
    if (ub >= lb) {
      tc = ub - lb + 1;
    } else { // ub < lb
      tc = 0; // zero-trip
    }
  } else if (st < 0) {
    if (lb >= ub) {
      // AC: cast to unsigned is needed for loops like (i=2B; i>-2B; i-=1B),
      // where the division needs to be unsigned regardless of the result type
      tc = (UT)(lb - ub) / (-st) + 1;
    } else { // lb < ub
      tc = 0; // zero-trip
    }
  } else { // st > 0
    if (ub >= lb) {
      // AC: cast to unsigned is needed for loops like (i=-2B; i<2B; i+=1B),
      // where the division needs to be unsigned regardless of the result type
      tc = (UT)(ub - lb) / st + 1;
    } else { // ub < lb
      tc = 0; // zero-trip
    }
  }
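  // Worked example (illustrative): for lb = 0, ub = 9, st = 2 the loop runs
  // i = 0, 2, 4, 6, 8, and indeed tc = (9 - 0) / 2 + 1 = 5. For a downward
  // loop with lb = 10, ub = 1, st = -3 it runs i = 10, 7, 4, 1, and
  // tc = (10 - 1) / 3 + 1 = 4. The unsigned casts matter only when the
  // lb/ub difference overflows the signed type, as noted in the comments.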

#if KMP_STATS_ENABLED
  if (KMP_MASTER_GTID(gtid)) {
    KMP_COUNT_VALUE(OMP_loop_dynamic_total_iterations, tc);
  }
#endif

  pr->u.p.lb = lb;
  pr->u.p.ub = ub;
  pr->u.p.st = st;
  pr->u.p.tc = tc;

#if KMP_OS_WINDOWS
  pr->u.p.last_upper = ub + st;
#endif /* KMP_OS_WINDOWS */

  /* NOTE: only the active parallel region(s) have active ordered sections */

  if (active) {
    if (pr->flags.ordered) {
      pr->ordered_bumped = 0;
      pr->u.p.ordered_lower = 1;
      pr->u.p.ordered_upper = 0;
    }
  }

  switch (schedule) {
#if KMP_STATIC_STEAL_ENABLED
  case kmp_sch_static_steal: {
    T ntc, init = 0;

    KD_TRACE(100,
             ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_static_steal case\n",
              gtid));

    ntc = (tc % chunk ? 1 : 0) + tc / chunk;
    if (nproc > 1 && ntc >= nproc) {
      KMP_COUNT_BLOCK(OMP_LOOP_STATIC_STEAL);
      T id = tid;
      T small_chunk, extras, p_extra = 0;
      kmp_uint32 old = UNUSED;
      int claimed = pr->steal_flag.compare_exchange_strong(old, CLAIMED);
      if (traits_t<T>::type_size > 4) {
        // AC: TODO: check if 16-byte CAS available and use it to
        // improve performance (probably wait for explicit request
        // before spending time on this).
        // For now use dynamically allocated per-private-buffer lock,
        // free memory in __kmp_dispatch_next when status==0.
        pr->u.p.steal_lock = (kmp_lock_t *)__kmp_allocate(sizeof(kmp_lock_t));
        __kmp_init_lock(pr->u.p.steal_lock);
      }

#if KMP_WEIGHTED_ITERATIONS_SUPPORTED
      // Iterations are divided in a 60/40 skewed distribution among CORE and
      // ATOM processors for hybrid systems
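      // Worked example (illustrative, with concrete numbers assumed here):
      // ntc = 100 chunks on 6 P-cores and 2 E-cores (nproc = 8) gives
      // e_multiplier = 1 / ((1.5 * 6) / 8 + 2 / 8) = 8 / 11, so
      // p_ntc = round(100 * 0.75 * 12 / 11) = 82 and e_ntc = 18: each P-core
      // owns about 13 chunks and each E-core 9, preserving roughly the 1.5x
      // per-core skew while covering all 100 chunks.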
      bool use_hybrid = false;
      kmp_hw_core_type_t core_type = KMP_HW_CORE_TYPE_UNKNOWN;
      T first_thread_with_ecore = 0;
      T num_procs_with_pcore = 0;
      T num_procs_with_ecore = 0;
      T p_ntc = 0, e_ntc = 0;
      if (__kmp_is_hybrid_cpu() && __kmp_affinity.type != affinity_none &&
          __kmp_affinity.type != affinity_explicit) {
        use_hybrid = true;
        core_type = (kmp_hw_core_type_t)th->th.th_topology_attrs.core_type;
        if (core_type != KMP_HW_CORE_TYPE_UNKNOWN &&
            __kmp_first_osid_with_ecore > -1) {
          for (int i = 0; i < team->t.t_nproc; ++i) {
            kmp_hw_core_type_t type = (kmp_hw_core_type_t)team->t.t_threads[i]
                                          ->th.th_topology_attrs.core_type;
            int id = team->t.t_threads[i]->th.th_topology_ids.os_id;
            if (id == __kmp_first_osid_with_ecore) {
              first_thread_with_ecore =
                  team->t.t_threads[i]->th.th_info.ds.ds_tid;
            }
            if (type == KMP_HW_CORE_TYPE_CORE) {
              num_procs_with_pcore++;
            } else if (type == KMP_HW_CORE_TYPE_ATOM) {
              num_procs_with_ecore++;
            } else {
              use_hybrid = false;
              break;
            }
          }
        }
        if (num_procs_with_pcore > 0 && num_procs_with_ecore > 0) {
          float multiplier = 60.0 / 40.0;
          float p_ratio = (float)num_procs_with_pcore / nproc;
          float e_ratio = (float)num_procs_with_ecore / nproc;
          float e_multiplier =
              (float)1 /
              (((multiplier * num_procs_with_pcore) / nproc) + e_ratio);
          float p_multiplier = multiplier * e_multiplier;
          p_ntc = __kmp_get_round_val(ntc * p_ratio * p_multiplier);
          if ((int)p_ntc > (int)(ntc * p_ratio * p_multiplier))
            e_ntc =
                (int)(__kmp_round_2decimal_val(ntc * e_ratio * e_multiplier));
          else
            e_ntc = __kmp_get_round_val(ntc * e_ratio * e_multiplier);
          KMP_DEBUG_ASSERT(ntc == p_ntc + e_ntc);

          // Use regular static steal if not enough chunks for skewed
          // distribution
          use_hybrid = (use_hybrid && (p_ntc >= num_procs_with_pcore &&
                                       e_ntc >= num_procs_with_ecore)
                            ? true
                            : false);
        } else {
          use_hybrid = false;
        }
      }
      pr->flags.use_hybrid = use_hybrid;
      pr->u.p.pchunks = p_ntc;
      pr->u.p.num_procs_with_pcore = num_procs_with_pcore;
      pr->u.p.first_thread_with_ecore = first_thread_with_ecore;

      if (use_hybrid) {
        KMP_DEBUG_ASSERT(nproc == num_procs_with_pcore + num_procs_with_ecore);
        T big_chunk = p_ntc / num_procs_with_pcore;
        small_chunk = e_ntc / num_procs_with_ecore;

        extras =
            (p_ntc % num_procs_with_pcore) + (e_ntc % num_procs_with_ecore);

        p_extra = (big_chunk - small_chunk);

        if (core_type == KMP_HW_CORE_TYPE_CORE) {
          if (id < first_thread_with_ecore) {
            init =
                id * small_chunk + id * p_extra + (id < extras ? id : extras);
          } else {
            init = id * small_chunk + (id - num_procs_with_ecore) * p_extra +
                   (id < extras ? id : extras);
          }
        } else {
          if (id == first_thread_with_ecore) {
            init =
                id * small_chunk + id * p_extra + (id < extras ? id : extras);
          } else {
            init = id * small_chunk + first_thread_with_ecore * p_extra +
                   (id < extras ? id : extras);
          }
        }
        p_extra = (core_type == KMP_HW_CORE_TYPE_CORE) ? p_extra : 0;
      } else
#endif
      {
        small_chunk = ntc / nproc;
        extras = ntc % nproc;
        init = id * small_chunk + (id < extras ? id : extras);
        p_extra = 0;
      }
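      // Worked example (illustrative): ntc = 10 chunks and nproc = 4 give
      // small_chunk = 2 and extras = 2, so the initial chunk ownership is
      // tid 0 -> [0,3), tid 1 -> [3,6), tid 2 -> [6,8), tid 3 -> [8,10);
      // the first `extras` threads each carry one extra chunk, and stealing
      // rebalances any uneven tails at runtime.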
      pr->u.p.count = init;
      if (claimed) { // did we succeed in claiming our own buffer?
        pr->u.p.ub = init + small_chunk + p_extra + (id < extras ? 1 : 0);
        // Other threads will inspect steal_flag when searching for a victim.
        // READY means other threads may steal from this thread from now on.
        KMP_ATOMIC_ST_REL(&pr->steal_flag, READY);
      } else {
        // another thread has stolen our whole range
        KMP_DEBUG_ASSERT(pr->steal_flag == THIEF);
        pr->u.p.ub = init; // mark that there are no iterations to work on
      }
      pr->u.p.parm2 = ntc; // save number of chunks
      // parm3 is the number of times to attempt stealing, which is
      // nproc (just a heuristic, could be optimized later on).
      pr->u.p.parm3 = nproc;
      pr->u.p.parm4 = (id + 1) % nproc; // remember neighbour tid
      break;
    } else {
      /* too few chunks: switching to kmp_sch_dynamic_chunked */
      schedule = kmp_sch_dynamic_chunked;
      KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d switching to "
                     "kmp_sch_dynamic_chunked\n",
                     gtid));
      goto dynamic_init;
      break;
    } // if
  } // case
#endif
  case kmp_sch_static_balanced: {
    T init, limit;

    KD_TRACE(
        100,
        ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_static_balanced case\n",
         gtid));

    if (nproc > 1) {
      T id = tid;

      if (tc < nproc) {
        if (id < tc) {
          init = id;
          limit = id;
          pr->u.p.parm1 = (id == tc - 1); /* parm1 stores *plastiter */
        } else {
          pr->u.p.count = 1; /* means no more chunks to execute */
          pr->u.p.parm1 = FALSE;
          break;
        }
      } else {
        T small_chunk = tc / nproc;
        T extras = tc % nproc;
        init = id * small_chunk + (id < extras ? id : extras);
        limit = init + small_chunk - (id < extras ? 0 : 1);
        pr->u.p.parm1 = (id == nproc - 1);
      }
    } else {
      if (tc > 0) {
        init = 0;
        limit = tc - 1;
        pr->u.p.parm1 = TRUE;
      } else {
        // zero trip count
        pr->u.p.count = 1; /* means no more chunks to execute */
        pr->u.p.parm1 = FALSE;
        break;
      }
    }
#if USE_ITT_BUILD
    // Calculate chunk for metadata report
    if (itt_need_metadata_reporting)
      if (cur_chunk)
        *cur_chunk = limit - init + 1;
#endif
    if (st == 1) {
      pr->u.p.lb = lb + init;
      pr->u.p.ub = lb + limit;
    } else {
      // calculated upper bound, "ub" is user-defined upper bound
      T ub_tmp = lb + limit * st;
      pr->u.p.lb = lb + init * st;
      // adjust upper bound to "ub" if needed, so that MS lastprivate will match
      // it exactly
      if (st > 0) {
        pr->u.p.ub = (ub_tmp + st > ub ? ub : ub_tmp);
      } else {
        pr->u.p.ub = (ub_tmp + st < ub ? ub : ub_tmp);
      }
    }
    if (pr->flags.ordered) {
      pr->u.p.ordered_lower = init;
      pr->u.p.ordered_upper = limit;
    }
    break;
  } // case
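  // Worked example (illustrative): tc = 10, nproc = 4, st = 1 yields
  // small_chunk = 2 and extras = 2, so the threads receive the index ranges
  // [0,2], [3,5], [6,7] and [8,9]; only tid 3 (id == nproc - 1) sets parm1,
  // the *plastiter flag used for lastprivate finalization.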
  case kmp_sch_static_balanced_chunked: {
    // similar to balanced, but chunk adjusted to multiple of simd width
    T nth = nproc;
    KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d runtime(simd:static)"
                   " -> falling-through to static_greedy\n",
                   gtid));
    schedule = kmp_sch_static_greedy;
    if (nth > 1)
      pr->u.p.parm1 = ((tc + nth - 1) / nth + chunk - 1) & ~(chunk - 1);
    else
      pr->u.p.parm1 = tc;
    break;
  } // case
  case kmp_sch_guided_simd:
  case kmp_sch_guided_iterative_chunked: {
    KD_TRACE(
        100,
        ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_guided_iterative_chunked"
         " case\n",
         gtid));

    if (nproc > 1) {
      if ((2L * chunk + 1) * nproc >= tc) {
        /* chunk size too large, switch to dynamic */
        schedule = kmp_sch_dynamic_chunked;
        goto dynamic_init;
      } else {
        // when remaining iters become less than parm2 - switch to dynamic
        pr->u.p.parm2 = guided_int_param * nproc * (chunk + 1);
        *(double *)&pr->u.p.parm3 =
            guided_flt_param / (double)nproc; // may occupy parm3 and parm4
      }
    } else {
      KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d falling-through to "
                     "kmp_sch_static_greedy\n",
                     gtid));
      schedule = kmp_sch_static_greedy;
      /* team->t.t_nproc == 1: fall-through to kmp_sch_static_greedy */
      KD_TRACE(
          100,
          ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_static_greedy case\n",
           gtid));
      pr->u.p.parm1 = tc;
    } // if
  } // case
  break;
  case kmp_sch_guided_analytical_chunked: {
    KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d "
                   "kmp_sch_guided_analytical_chunked case\n",
                   gtid));

    if (nproc > 1) {
      if ((2L * chunk + 1) * nproc >= tc) {
        /* chunk size too large, switch to dynamic */
        schedule = kmp_sch_dynamic_chunked;
        goto dynamic_init;
      } else {
        /* commonly used term: (2 nproc - 1)/(2 nproc) */
        DBL x;

#if KMP_USE_X87CONTROL
        /* Linux* OS already has 64-bit computation by default for long double,
           and on Windows* OS on Intel(R) 64, /Qlong_double doesn't work. On
           Windows* OS on IA-32 architecture, we need to set precision to 64-bit
           instead of the default 53-bit. Even though long double doesn't work
           on Windows* OS on Intel(R) 64, the resulting lack of precision is not
           expected to impact the correctness of the algorithm, but this has not
           been mathematically proven. */
        // save original FPCW and set precision to 64-bit, as
        // Windows* OS on IA-32 architecture defaults to 53-bit
        unsigned int oldFpcw = _control87(0, 0);
        _control87(_PC_64, _MCW_PC); // 0,0x30000
#endif
        /* value used for comparison in solver for cross-over point */
        KMP_ASSERT(tc > 0);
        long double target = ((long double)chunk * 2 + 1) * nproc / tc;

        /* crossover point--chunk indexes equal to or greater than
           this point switch to dynamic-style scheduling */
        UT cross;

        /* commonly used term: (2 nproc - 1)/(2 nproc) */
        x = 1.0 - 0.5 / (double)nproc;

#ifdef KMP_DEBUG
        { // test natural alignment
          struct _test_a {
            char a;
            union {
              char b;
              DBL d;
            };
          } t;
          ptrdiff_t natural_alignment =
              (ptrdiff_t)&t.b - (ptrdiff_t)&t - (ptrdiff_t)1;
          //__kmp_warn( " %llx %llx %lld", (long long)&t.d, (long long)&t, (long
          // long)natural_alignment );
          KMP_DEBUG_ASSERT(
              (((ptrdiff_t)&pr->u.p.parm3) & (natural_alignment)) == 0);
        }
#endif // KMP_DEBUG

        /* save the term in thread private dispatch structure */
        *(DBL *)&pr->u.p.parm3 = x;

        /* solve for the crossover point to the nearest integer i for which C_i
           <= chunk */
        {
          UT left, right, mid;
          long double p;

          /* estimate initial upper and lower bound */

          /* doesn't matter what value right is as long as it is positive, but
             it affects performance of the solver */
          right = 229;
          p = __kmp_pow<UT>(x, right);
          if (p > target) {
            do {
              p *= p;
              right <<= 1;
            } while (p > target && right < (1 << 27));
            /* lower bound is previous (failed) estimate of upper bound */
            left = right >> 1;
          } else {
            left = 0;
          }

          /* bisection root-finding method */
          while (left + 1 < right) {
            mid = (left + right) / 2;
            if (__kmp_pow<UT>(x, mid) > target) {
              left = mid;
            } else {
              right = mid;
            }
          } // while
          cross = right;
        }
        /* assert sanity of computed crossover point */
        KMP_ASSERT(cross && __kmp_pow<UT>(x, cross - 1) > target &&
                   __kmp_pow<UT>(x, cross) <= target);

        /* save the crossover point in thread private dispatch structure */
        pr->u.p.parm2 = cross;
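        // Worked example (illustrative): with nproc = 4 and chunk = 1 over
        // tc = 100 iterations, x = 1 - 0.5/4 = 0.875 and
        // target = (2*1 + 1) * 4 / 100 = 0.12. Since 0.875^15 > 0.12 and
        // 0.875^16 <= 0.12, the bisection settles on cross = 16: chunk
        // indexes 16 and above switch to dynamic-style scheduling.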

// C75803
#if ((KMP_OS_LINUX || KMP_OS_WINDOWS) && KMP_ARCH_X86) && (!defined(KMP_I8))
#define GUIDED_ANALYTICAL_WORKAROUND (*(DBL *)&pr->u.p.parm3)
#else
#define GUIDED_ANALYTICAL_WORKAROUND (x)
#endif
        /* dynamic-style scheduling offset */
        pr->u.p.count = tc -
                        __kmp_dispatch_guided_remaining(
                            tc, GUIDED_ANALYTICAL_WORKAROUND, cross) -
                        cross * chunk;
#if KMP_USE_X87CONTROL
        // restore FPCW
        _control87(oldFpcw, _MCW_PC);
#endif
      } // if
    } else {
      KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d falling-through to "
                     "kmp_sch_static_greedy\n",
                     gtid));
      schedule = kmp_sch_static_greedy;
      /* team->t.t_nproc == 1: fall-through to kmp_sch_static_greedy */
      pr->u.p.parm1 = tc;
    } // if
  } // case
  break;
  case kmp_sch_static_greedy:
    KD_TRACE(
        100,
        ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_static_greedy case\n",
         gtid));
    pr->u.p.parm1 = (nproc > 1) ? (tc + nproc - 1) / nproc : tc;
    break;
  case kmp_sch_static_chunked:
  case kmp_sch_dynamic_chunked:
  dynamic_init:
    if (tc == 0)
      break;
    if (pr->u.p.parm1 <= 0)
      pr->u.p.parm1 = KMP_DEFAULT_CHUNK;
    else if (pr->u.p.parm1 > tc)
      pr->u.p.parm1 = tc;
    // Store the total number of chunks to prevent integer overflow during
    // bounds calculations in the get next chunk routine.
    pr->u.p.parm2 = (tc / pr->u.p.parm1) + (tc % pr->u.p.parm1 ? 1 : 0);
    KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d "
                   "kmp_sch_static_chunked/kmp_sch_dynamic_chunked cases\n",
                   gtid));
    break;
  case kmp_sch_trapezoidal: {
    /* TSS: trapezoid self-scheduling, minimum chunk_size = parm1 */

    T parm1, parm2, parm3, parm4;
    KD_TRACE(100,
             ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_trapezoidal case\n",
              gtid));

    parm1 = chunk;

    /* F : size of the first cycle */
    parm2 = (tc / (2 * nproc));

    if (parm2 < 1) {
      parm2 = 1;
    }

    /* L : size of the last cycle. Make sure the last cycle is not larger
       than the first cycle. */
    if (parm1 < 1) {
      parm1 = 1;
    } else if (parm1 > parm2) {
      parm1 = parm2;
    }

    /* N : number of cycles */
    parm3 = (parm2 + parm1);
    parm3 = (2 * tc + parm3 - 1) / parm3;

    if (parm3 < 2) {
      parm3 = 2;
    }

    /* sigma : decreasing incr of the trapezoid */
    parm4 = (parm3 - 1);
    parm4 = (parm2 - parm1) / parm4;

    // pointless check, because parm4 >= 0 always
    // if ( parm4 < 0 ) {
    //   parm4 = 0;
    //}

    pr->u.p.parm1 = parm1;
    pr->u.p.parm2 = parm2;
    pr->u.p.parm3 = parm3;
    pr->u.p.parm4 = parm4;
  } // case
  break;
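  // Worked example (illustrative): tc = 100, nproc = 2, chunk = 1 gives
  // parm1 = 1 (minimum chunk), parm2 = 100 / 4 = 25 (first cycle),
  // parm3 = (2*100 + 26 - 1) / 26 = 8 cycles, parm4 = (25 - 1) / 7 = 3
  // (decrement), so successive cycles hand out 25, 22, 19, ..., 4 iterations;
  // their sum (116) covers the trip count, with the tail clipped by the
  // bounds checks in the dispatch routine.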

  default: {
    __kmp_fatal(KMP_MSG(UnknownSchedTypeDetected), // Primary message
                KMP_HNT(GetNewerLibrary), // Hint
                __kmp_msg_null // Variadic argument list terminator
    );
  } break;
  } // switch
  pr->schedule = schedule;
}

#if KMP_USE_HIER_SCHED
template <typename T>
inline void __kmp_dispatch_init_hier_runtime(ident_t *loc, T lb, T ub,
                                             typename traits_t<T>::signed_t st);
template <>
inline void
__kmp_dispatch_init_hier_runtime<kmp_int32>(ident_t *loc, kmp_int32 lb,
                                            kmp_int32 ub, kmp_int32 st) {
  __kmp_dispatch_init_hierarchy<kmp_int32>(
      loc, __kmp_hier_scheds.size, __kmp_hier_scheds.layers,
      __kmp_hier_scheds.scheds, __kmp_hier_scheds.small_chunks, lb, ub, st);
}
template <>
inline void
__kmp_dispatch_init_hier_runtime<kmp_uint32>(ident_t *loc, kmp_uint32 lb,
                                             kmp_uint32 ub, kmp_int32 st) {
  __kmp_dispatch_init_hierarchy<kmp_uint32>(
      loc, __kmp_hier_scheds.size, __kmp_hier_scheds.layers,
      __kmp_hier_scheds.scheds, __kmp_hier_scheds.small_chunks, lb, ub, st);
}
template <>
inline void
__kmp_dispatch_init_hier_runtime<kmp_int64>(ident_t *loc, kmp_int64 lb,
                                            kmp_int64 ub, kmp_int64 st) {
  __kmp_dispatch_init_hierarchy<kmp_int64>(
      loc, __kmp_hier_scheds.size, __kmp_hier_scheds.layers,
      __kmp_hier_scheds.scheds, __kmp_hier_scheds.large_chunks, lb, ub, st);
}
template <>
inline void
__kmp_dispatch_init_hier_runtime<kmp_uint64>(ident_t *loc, kmp_uint64 lb,
                                             kmp_uint64 ub, kmp_int64 st) {
  __kmp_dispatch_init_hierarchy<kmp_uint64>(
      loc, __kmp_hier_scheds.size, __kmp_hier_scheds.layers,
      __kmp_hier_scheds.scheds, __kmp_hier_scheds.large_chunks, lb, ub, st);
}

// free all the hierarchy scheduling memory associated with the team
void __kmp_dispatch_free_hierarchies(kmp_team_t *team) {
  int num_disp_buff = team->t.t_max_nproc > 1 ? __kmp_dispatch_num_buffers : 2;
  for (int i = 0; i < num_disp_buff; ++i) {
    // type does not matter here so use kmp_int32
    auto sh =
        reinterpret_cast<dispatch_shared_info_template<kmp_int32> volatile *>(
            &team->t.t_disp_buffer[i]);
    if (sh->hier) {
      sh->hier->deallocate();
      __kmp_free(sh->hier);
    }
  }
}
#endif

// UT - unsigned flavor of T, ST - signed flavor of T,
// DBL - double if sizeof(T)==4, or long double if sizeof(T)==8
template <typename T>
static void
__kmp_dispatch_init(ident_t *loc, int gtid, enum sched_type schedule, T lb,
                    T ub, typename traits_t<T>::signed_t st,
                    typename traits_t<T>::signed_t chunk, int push_ws) {
  typedef typename traits_t<T>::unsigned_t UT;

  int active;
  kmp_info_t *th;
  kmp_team_t *team;
  kmp_uint32 my_buffer_index;
  dispatch_private_info_template<T> *pr;
  dispatch_shared_info_template<T> volatile *sh;

  KMP_BUILD_ASSERT(sizeof(dispatch_private_info_template<T>) ==
                   sizeof(dispatch_private_info));
  KMP_BUILD_ASSERT(sizeof(dispatch_shared_info_template<UT>) ==
                   sizeof(dispatch_shared_info));
  __kmp_assert_valid_gtid(gtid);

  if (!TCR_4(__kmp_init_parallel))
    __kmp_parallel_initialize();

  __kmp_resume_if_soft_paused();

#if INCLUDE_SSC_MARKS
  SSC_MARK_DISPATCH_INIT();
#endif
#ifdef KMP_DEBUG
  typedef typename traits_t<T>::signed_t ST;
  {
    char *buff;
    // create format specifiers before the debug output
    buff = __kmp_str_format("__kmp_dispatch_init: T#%%d called: schedule:%%d "
                            "chunk:%%%s lb:%%%s ub:%%%s st:%%%s\n",
                            traits_t<ST>::spec, traits_t<T>::spec,
                            traits_t<T>::spec, traits_t<ST>::spec);
    KD_TRACE(10, (buff, gtid, schedule, chunk, lb, ub, st));
    __kmp_str_free(&buff);
  }
#endif
  /* setup data */
  th = __kmp_threads[gtid];
  team = th->th.th_team;
  active = !team->t.t_serialized;
  th->th.th_ident = loc;

  // Any half-decent optimizer will remove this test when the blocks are empty
  // since the macros expand to nothing when statistics are disabled.
1010261535Smarkj if (schedule == __kmp_static) { 1011261535Smarkj KMP_COUNT_BLOCK(OMP_LOOP_STATIC); 1012261535Smarkj } else { 1013261535Smarkj KMP_COUNT_BLOCK(OMP_LOOP_DYNAMIC); 1014227068Sambrisko } 1015227068Sambrisko 1016233711Sambrisko#if KMP_USE_HIER_SCHED 1017227068Sambrisko // Initialize the scheduling hierarchy if requested in OMP_SCHEDULE envirable 1018227068Sambrisko // Hierarchical scheduling does not work with ordered, so if ordered is 1019227068Sambrisko // detected, then revert back to threaded scheduling. 1020227068Sambrisko bool ordered; 1021227068Sambrisko enum sched_type my_sched = schedule; 1022233711Sambrisko my_buffer_index = th->th.th_dispatch->th_disp_index; 1023227068Sambrisko pr = reinterpret_cast<dispatch_private_info_template<T> *>( 1024233711Sambrisko &th->th.th_dispatch 1025227068Sambrisko ->th_disp_buffer[my_buffer_index % __kmp_dispatch_num_buffers]); 1026261535Smarkj my_sched = SCHEDULE_WITHOUT_MODIFIERS(my_sched); 1027261535Smarkj if ((my_sched >= kmp_nm_lower) && (my_sched < kmp_nm_upper)) 1028261535Smarkj my_sched = 1029261535Smarkj (enum sched_type)(((int)my_sched) - (kmp_nm_lower - kmp_sch_lower)); 1030261535Smarkj ordered = (kmp_ord_lower & my_sched); 1031261535Smarkj if (pr->flags.use_hier) { 1032261535Smarkj if (ordered) { 1033227068Sambrisko KD_TRACE(100, ("__kmp_dispatch_init: T#%d ordered loop detected. " 1034227068Sambrisko "Disabling hierarchical scheduling.\n", 1035227068Sambrisko gtid)); 1036227068Sambrisko pr->flags.use_hier = FALSE; 1037227068Sambrisko } 1038227068Sambrisko } 1039227068Sambrisko if (schedule == kmp_sch_runtime && __kmp_hier_scheds.size > 0) { 1040227068Sambrisko // Don't use hierarchical for ordered parallel loops and don't 1041227068Sambrisko // use the runtime hierarchy if one was specified in the program 1042227068Sambrisko if (!ordered && !pr->flags.use_hier) 1043227068Sambrisko __kmp_dispatch_init_hier_runtime<T>(loc, lb, ub, st); 1044227068Sambrisko } 1045247369Ssmh#endif // KMP_USE_HIER_SCHED 1046247369Ssmh 1047247369Ssmh#if USE_ITT_BUILD 1048227068Sambrisko kmp_uint64 cur_chunk = chunk; 1049227068Sambrisko int itt_need_metadata_reporting = 1050227068Sambrisko __itt_metadata_add_ptr && __kmp_forkjoin_frames_mode == 3 && 1051247369Ssmh KMP_MASTER_GTID(gtid) && th->th.th_teams_microtask == NULL && 1052247369Ssmh team->t.t_active_level == 1; 1053247369Ssmh#endif 1054247369Ssmh if (!active) { 1055247369Ssmh pr = reinterpret_cast<dispatch_private_info_template<T> *>( 1056247369Ssmh th->th.th_dispatch->th_disp_buffer); /* top of the stack */ 1057247369Ssmh } else { 1058247369Ssmh KMP_DEBUG_ASSERT(th->th.th_dispatch == 1059247369Ssmh &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]); 1060227068Sambrisko 1061227068Sambrisko my_buffer_index = th->th.th_dispatch->th_disp_index++; 1062227068Sambrisko 1063227068Sambrisko /* What happens when number of threads changes, need to resize buffer? */ 1064227068Sambrisko pr = reinterpret_cast<dispatch_private_info_template<T> *>( 1065227068Sambrisko &th->th.th_dispatch 1066227068Sambrisko ->th_disp_buffer[my_buffer_index % __kmp_dispatch_num_buffers]); 1067227068Sambrisko sh = reinterpret_cast<dispatch_shared_info_template<T> volatile *>( 1068227068Sambrisko &team->t.t_disp_buffer[my_buffer_index % __kmp_dispatch_num_buffers]); 1069227068Sambrisko KD_TRACE(10, ("__kmp_dispatch_init: T#%d my_buffer_index:%d\n", gtid, 1070227068Sambrisko my_buffer_index)); 1071227068Sambrisko if (sh->buffer_index != my_buffer_index) { // too many loops in progress? 

  __kmp_dispatch_init_algorithm(loc, gtid, pr, schedule, lb, ub, st,
#if USE_ITT_BUILD
                                &cur_chunk,
#endif
                                chunk, (T)th->th.th_team_nproc,
                                (T)th->th.th_info.ds.ds_tid);
  if (active) {
    if (pr->flags.ordered == 0) {
      th->th.th_dispatch->th_deo_fcn = __kmp_dispatch_deo_error;
      th->th.th_dispatch->th_dxo_fcn = __kmp_dispatch_dxo_error;
    } else {
      th->th.th_dispatch->th_deo_fcn = __kmp_dispatch_deo<UT>;
      th->th.th_dispatch->th_dxo_fcn = __kmp_dispatch_dxo<UT>;
    }
    th->th.th_dispatch->th_dispatch_pr_current = (dispatch_private_info_t *)pr;
    th->th.th_dispatch->th_dispatch_sh_current =
        CCAST(dispatch_shared_info_t *, (volatile dispatch_shared_info_t *)sh);
#if USE_ITT_BUILD
    if (pr->flags.ordered) {
      __kmp_itt_ordered_init(gtid);
    }
    // Report loop metadata
    if (itt_need_metadata_reporting) {
      // Only report metadata by primary thread of active team at level 1
      kmp_uint64 schedtype = 0;
      switch (schedule) {
      case kmp_sch_static_chunked:
      case kmp_sch_static_balanced: // Chunk is calculated in the switch above
        break;
      case kmp_sch_static_greedy:
        cur_chunk = pr->u.p.parm1;
        break;
      case kmp_sch_dynamic_chunked:
        schedtype = 1;
        break;
      case kmp_sch_guided_iterative_chunked:
      case kmp_sch_guided_analytical_chunked:
      case kmp_sch_guided_simd:
        schedtype = 2;
        break;
      default:
        // Should we put this case under "static"?
        // case kmp_sch_static_steal:
        schedtype = 3;
        break;
      }
      __kmp_itt_metadata_loop(loc, schedtype, pr->u.p.tc, cur_chunk);
    }
#if KMP_USE_HIER_SCHED
    if (pr->flags.use_hier) {
      pr->u.p.count = 0;
      pr->u.p.ub = pr->u.p.lb = pr->u.p.st = pr->u.p.tc = 0;
    }
#endif // KMP_USE_HIER_SCHED
#endif /* USE_ITT_BUILD */
  }

#ifdef KMP_DEBUG
  {
    char *buff;
    // create format specifiers before the debug output
    buff = __kmp_str_format(
        "__kmp_dispatch_init: T#%%d returning: schedule:%%d ordered:%%%s "
        "lb:%%%s ub:%%%s"
        " st:%%%s tc:%%%s count:%%%s\n\tordered_lower:%%%s ordered_upper:%%%s"
        " parm1:%%%s parm2:%%%s parm3:%%%s parm4:%%%s\n",
        traits_t<UT>::spec, traits_t<T>::spec, traits_t<T>::spec,
        traits_t<ST>::spec, traits_t<UT>::spec, traits_t<UT>::spec,
        traits_t<UT>::spec, traits_t<UT>::spec, traits_t<T>::spec,
        traits_t<T>::spec, traits_t<T>::spec, traits_t<T>::spec);
    KD_TRACE(10, (buff, gtid, pr->schedule, pr->flags.ordered, pr->u.p.lb,
                  pr->u.p.ub, pr->u.p.st, pr->u.p.tc, pr->u.p.count,
                  pr->u.p.ordered_lower, pr->u.p.ordered_upper, pr->u.p.parm1,
                  pr->u.p.parm2, pr->u.p.parm3, pr->u.p.parm4));
    __kmp_str_free(&buff);
  }
#endif
#if OMPT_SUPPORT && OMPT_OPTIONAL
  if (ompt_enabled.ompt_callback_work) {
    ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL);
    ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
    ompt_callbacks.ompt_callback(ompt_callback_work)(
        ompt_work_loop, ompt_scope_begin, &(team_info->parallel_data),
        &(task_info->task_data), pr->u.p.tc, OMPT_LOAD_RETURN_ADDRESS(gtid));
  }
#endif
  KMP_PUSH_PARTITIONED_TIMER(OMP_loop_dynamic);
}

/* For ordered loops, either __kmp_dispatch_finish() should be called after
 * every iteration, or __kmp_dispatch_finish_chunk() should be called after
 * every chunk of iterations. If the ordered section(s) were not executed
 * for this iteration (or every iteration in this chunk), we need to set the
 * ordered iteration counters so that the next thread can proceed. */
template <typename UT>
static void __kmp_dispatch_finish(int gtid, ident_t *loc) {
  typedef typename traits_t<UT>::signed_t ST;
  __kmp_assert_valid_gtid(gtid);
  kmp_info_t *th = __kmp_threads[gtid];

  KD_TRACE(100, ("__kmp_dispatch_finish: T#%d called\n", gtid));
  if (!th->th.th_team->t.t_serialized) {

    dispatch_private_info_template<UT> *pr =
        reinterpret_cast<dispatch_private_info_template<UT> *>(
            th->th.th_dispatch->th_dispatch_pr_current);
    dispatch_shared_info_template<UT> volatile *sh =
        reinterpret_cast<dispatch_shared_info_template<UT> volatile *>(
            th->th.th_dispatch->th_dispatch_sh_current);
    KMP_DEBUG_ASSERT(pr);
    KMP_DEBUG_ASSERT(sh);
    KMP_DEBUG_ASSERT(th->th.th_dispatch ==
                     &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);

    if (pr->ordered_bumped) {
      KD_TRACE(
          1000,
          ("__kmp_dispatch_finish: T#%d resetting ordered_bumped to zero\n",
           gtid));
      pr->ordered_bumped = 0;
    } else {
      UT lower = pr->u.p.ordered_lower;

#ifdef KMP_DEBUG
      {
        char *buff;
        // create format specifiers before the debug output
        buff = __kmp_str_format("__kmp_dispatch_finish: T#%%d before wait: "
                                "ordered_iteration:%%%s lower:%%%s\n",
                                traits_t<UT>::spec, traits_t<UT>::spec);
        KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower));
        __kmp_str_free(&buff);
      }
#endif

      __kmp_wait<UT>(&sh->u.s.ordered_iteration, lower,
                     __kmp_ge<UT> USE_ITT_BUILD_ARG(NULL));
      KMP_MB(); /* is this necessary? */
#ifdef KMP_DEBUG
      {
        char *buff;
        // create format specifiers before the debug output
        buff = __kmp_str_format("__kmp_dispatch_finish: T#%%d after wait: "
                                "ordered_iteration:%%%s lower:%%%s\n",
                                traits_t<UT>::spec, traits_t<UT>::spec);
        KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower));
        __kmp_str_free(&buff);
      }
#endif

      test_then_inc<ST>((volatile ST *)&sh->u.s.ordered_iteration);
    } // if
  } // if
  KD_TRACE(100, ("__kmp_dispatch_finish: T#%d returned\n", gtid));
}
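// Illustrative scenario: with an ordered schedule of chunk 1, the thread that
// owns iteration k has ordered_lower == k. If its ordered section already ran,
// ordered_bumped is nonzero and is simply reset. Otherwise this finish path
// waits until sh->u.s.ordered_iteration reaches k and then bumps the counter,
// releasing the owner of iteration k + 1 exactly as if the section had run.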
1223#ifdef KMP_DEBUG
1224       {
1225         char *buff;
1226         // create format specifiers before the debug output
1227         buff = __kmp_str_format("__kmp_dispatch_finish: T#%%d after wait: "
1228                                 "ordered_iteration:%%%s lower:%%%s\n",
1229                                 traits_t<UT>::spec, traits_t<UT>::spec);
1230         KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower));
1231         __kmp_str_free(&buff);
1232       }
1233#endif
1234
1235       test_then_inc<ST>((volatile ST *)&sh->u.s.ordered_iteration);
1236     } // if
1237   } // if
1238   KD_TRACE(100, ("__kmp_dispatch_finish: T#%d returned\n", gtid));
1239}
1240
1241#ifdef KMP_GOMP_COMPAT
1242
1243template <typename UT>
1244static void __kmp_dispatch_finish_chunk(int gtid, ident_t *loc) {
1245   typedef typename traits_t<UT>::signed_t ST;
1246   __kmp_assert_valid_gtid(gtid);
1247   kmp_info_t *th = __kmp_threads[gtid];
1248
1249   KD_TRACE(100, ("__kmp_dispatch_finish_chunk: T#%d called\n", gtid));
1250   if (!th->th.th_team->t.t_serialized) {
1251     dispatch_private_info_template<UT> *pr =
1252         reinterpret_cast<dispatch_private_info_template<UT> *>(
1253             th->th.th_dispatch->th_dispatch_pr_current);
1254     dispatch_shared_info_template<UT> volatile *sh =
1255         reinterpret_cast<dispatch_shared_info_template<UT> volatile *>(
1256             th->th.th_dispatch->th_dispatch_sh_current);
1257     KMP_DEBUG_ASSERT(pr);
1258     KMP_DEBUG_ASSERT(sh);
1259     KMP_DEBUG_ASSERT(th->th.th_dispatch ==
1260                      &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);
1261
1262     UT lower = pr->u.p.ordered_lower;
1263     UT upper = pr->u.p.ordered_upper;
1264     UT inc = upper - lower + 1;
1265
1266     if (pr->ordered_bumped == inc) {
1267       KD_TRACE(
1268           1000,
1269           ("__kmp_dispatch_finish: T#%d resetting ordered_bumped to zero\n",
1270            gtid));
1271       pr->ordered_bumped = 0;
1272     } else {
1273       inc -= pr->ordered_bumped;
1274
1275#ifdef KMP_DEBUG
1276       {
1277         char *buff;
1278         // create format specifiers before the debug output
1279         buff = __kmp_str_format(
1280             "__kmp_dispatch_finish_chunk: T#%%d before wait: "
1281             "ordered_iteration:%%%s lower:%%%s upper:%%%s\n",
1282             traits_t<UT>::spec, traits_t<UT>::spec, traits_t<UT>::spec);
1283         KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower, upper));
1284         __kmp_str_free(&buff);
1285       }
1286#endif
1287
1288       __kmp_wait<UT>(&sh->u.s.ordered_iteration, lower,
1289                      __kmp_ge<UT> USE_ITT_BUILD_ARG(NULL));
1290
1291       KMP_MB(); /* is this necessary? */
1292       KD_TRACE(1000, ("__kmp_dispatch_finish_chunk: T#%d resetting "
1293                       "ordered_bumped to zero\n",
1294                       gtid));
1295       pr->ordered_bumped = 0;
1296       // TODO: check whether inc should be unsigned or signed
1297#ifdef KMP_DEBUG
1298       {
1299         char *buff;
1300         // create format specifiers before the debug output
1301         buff = __kmp_str_format(
1302             "__kmp_dispatch_finish_chunk: T#%%d after wait: "
1303             "ordered_iteration:%%%s inc:%%%s lower:%%%s upper:%%%s\n",
1304             traits_t<UT>::spec, traits_t<UT>::spec, traits_t<UT>::spec,
1305             traits_t<UT>::spec);
1306         KD_TRACE(1000,
1307                  (buff, gtid, sh->u.s.ordered_iteration, inc, lower, upper));
1308         __kmp_str_free(&buff);
1309       }
1310#endif
1311
1312       test_then_add<ST>((volatile ST *)&sh->u.s.ordered_iteration, inc);
1313     }
1314     // }
1315   }
1316   KD_TRACE(100, ("__kmp_dispatch_finish_chunk: T#%d returned\n", gtid));
1317}
1318
1319#endif /* KMP_GOMP_COMPAT */
1320
1321template <typename T>
1322int __kmp_dispatch_next_algorithm(int gtid,
1323                                  dispatch_private_info_template<T> *pr,
1324                                  dispatch_shared_info_template<T> volatile *sh,
1325                                  kmp_int32 *p_last, T *p_lb, T *p_ub,
1326                                  typename traits_t<T>::signed_t *p_st, T nproc,
1327                                  T tid) {
1328   typedef typename traits_t<T>::unsigned_t UT;
1329   typedef typename traits_t<T>::signed_t ST;
1330   typedef typename traits_t<T>::floating_t DBL;
1331   int status = 0;
1332   bool last = false;
1333   T start;
1334   ST incr;
1335   UT limit, trip, init;
1336   kmp_info_t *th = __kmp_threads[gtid];
1337   kmp_team_t *team = th->th.th_team;
1338
1339   KMP_DEBUG_ASSERT(th->th.th_dispatch ==
1340                    &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);
1341   KMP_DEBUG_ASSERT(pr);
1342   KMP_DEBUG_ASSERT(sh);
1343   KMP_DEBUG_ASSERT(tid >= 0 && tid < nproc);
1344#ifdef KMP_DEBUG
1345   {
1346     char *buff;
1347     // create format specifiers before the debug output
1348     buff =
1349         __kmp_str_format("__kmp_dispatch_next_algorithm: T#%%d called pr:%%p "
1350                          "sh:%%p nproc:%%%s tid:%%%s\n",
1351                          traits_t<T>::spec, traits_t<T>::spec);
1352     KD_TRACE(10, (buff, gtid, pr, sh, nproc, tid));
1353     __kmp_str_free(&buff);
1354   }
1355#endif
1356
1357   // zero trip count
1358   if (pr->u.p.tc == 0) {
1359     KD_TRACE(10,
1360              ("__kmp_dispatch_next_algorithm: T#%d early exit trip count is "
1361               "zero status:%d\n",
1362               gtid, status));
1363     return 0;
1364   }
1365
1366   switch (pr->schedule) {
1367#if KMP_STATIC_STEAL_ENABLED
1368   case kmp_sch_static_steal: {
1369     T chunk = pr->u.p.parm1;
1370     UT nchunks = pr->u.p.parm2;
1371     KD_TRACE(100,
1372              ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_static_steal case\n",
1373               gtid));
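     /* Sketch of the bookkeeping used by this case (added commentary,
      * consolidating what __kmp_dispatch_init sets up and the code below uses):
      *   pr->u.p.parm1  chunk size
      *   pr->u.p.parm2  total number of chunks (nchunks)
      *   pr->u.p.parm3  bound on the steal-search loop (while_limit)
      *   pr->u.p.parm4  id of the victim to try stealing from next
      *   [count, ub)    half-open range of chunk indices this thread owns
      * steal_flag lifecycle, roughly: UNUSED (buffer not claimed) -> READY
      * (non-empty, others may steal from it) -> THIEF (owner is out of own
      * chunks and is stealing), reset to UNUSED when the loop completes. */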
1374
1375     trip = pr->u.p.tc - 1;
1376
1377     if (traits_t<T>::type_size > 4) {
1378       // use lock for 8-byte induction variable.
1379       // TODO (optional): check presence and use 16-byte CAS
1380       kmp_lock_t *lck = pr->u.p.steal_lock;
1381       KMP_DEBUG_ASSERT(lck != NULL);
1382       if (pr->u.p.count < (UT)pr->u.p.ub) {
1383         KMP_DEBUG_ASSERT(pr->steal_flag == READY);
1384         __kmp_acquire_lock(lck, gtid);
1385         // try to get own chunk of iterations
1386         init = (pr->u.p.count)++;
1387         status = (init < (UT)pr->u.p.ub);
1388         __kmp_release_lock(lck, gtid);
1389       } else {
1390         status = 0; // no own chunks
1391       }
1392       if (!status) { // try to steal
1393         kmp_lock_t *lckv; // victim buffer's lock
1394         T while_limit = pr->u.p.parm3;
1395         T while_index = 0;
1396         int idx = (th->th.th_dispatch->th_disp_index - 1) %
1397                   __kmp_dispatch_num_buffers; // current loop index
1398         // note: victim thread can potentially execute another loop
1399         KMP_ATOMIC_ST_REL(&pr->steal_flag, THIEF); // mark self buffer inactive
1400         while ((!status) && (while_limit != ++while_index)) {
1401           dispatch_private_info_template<T> *v;
1402           T remaining;
1403           T victimId = pr->u.p.parm4;
1404           T oldVictimId = victimId ? victimId - 1 : nproc - 1;
1405           v = reinterpret_cast<dispatch_private_info_template<T> *>(
1406               &team->t.t_dispatch[victimId].th_disp_buffer[idx]);
1407           KMP_DEBUG_ASSERT(v);
1408           while ((v == pr || KMP_ATOMIC_LD_RLX(&v->steal_flag) == THIEF) &&
1409                  oldVictimId != victimId) {
1410             victimId = (victimId + 1) % nproc;
1411             v = reinterpret_cast<dispatch_private_info_template<T> *>(
1412                 &team->t.t_dispatch[victimId].th_disp_buffer[idx]);
1413             KMP_DEBUG_ASSERT(v);
1414           }
1415           if (v == pr || KMP_ATOMIC_LD_RLX(&v->steal_flag) == THIEF) {
1416             continue; // try once more (nproc attempts in total)
1417           }
1418           if (KMP_ATOMIC_LD_RLX(&v->steal_flag) == UNUSED) {
1419             kmp_uint32 old = UNUSED;
1420             // try to steal whole range from inactive victim
1421             status = v->steal_flag.compare_exchange_strong(old, THIEF);
1422             if (status) {
1423               // initialize self buffer with victim's whole range of chunks
1424               T id = victimId;
1425               T small_chunk = 0, extras = 0, p_extra = 0;
1426               __kmp_initialize_self_buffer<T>(team, id, pr, nchunks, nproc,
1427                                               init, small_chunk, extras,
1428                                               p_extra);
1429               __kmp_acquire_lock(lck, gtid);
1430               pr->u.p.count = init + 1; // exclude one we execute immediately
1431               pr->u.p.ub = init + small_chunk + p_extra + (id < extras ? 1 : 0);
1432               __kmp_release_lock(lck, gtid);
1433               pr->u.p.parm4 = (id + 1) % nproc; // remember neighbour tid
1434               // no need to reinitialize other thread invariants: lb, st, etc.
1435#ifdef KMP_DEBUG
1436               {
1437                 char *buff;
1438                 // create format specifiers before the debug output
1439                 buff = __kmp_str_format("__kmp_dispatch_next_algorithm: T#%%d "
1440                                         "stolen chunks from T#%%d, "
1441                                         "count:%%%s ub:%%%s\n",
1442                                         traits_t<UT>::spec, traits_t<T>::spec);
1443                 KD_TRACE(10, (buff, gtid, id, pr->u.p.count, pr->u.p.ub));
1444                 __kmp_str_free(&buff);
1445               }
1446#endif
1447               // activate non-empty buffer and let others steal from us
1448               if (pr->u.p.count < (UT)pr->u.p.ub)
1449                 KMP_ATOMIC_ST_REL(&pr->steal_flag, READY);
1450               break;
1451             }
1452           }
1453           if (KMP_ATOMIC_LD_ACQ(&v->steal_flag) != READY ||
1454               v->u.p.count >= (UT)v->u.p.ub) {
1455             pr->u.p.parm4 = (victimId + 1) % nproc; // shift start victim tid
1456             continue; // no chunks to steal, try next victim
1457           }
1458           lckv = v->u.p.steal_lock;
1459           KMP_ASSERT(lckv != NULL);
1460           __kmp_acquire_lock(lckv, gtid);
1461           limit = v->u.p.ub; // keep initial ub
1462           if (v->u.p.count >= limit) {
1463             __kmp_release_lock(lckv, gtid);
1464             pr->u.p.parm4 = (victimId + 1) % nproc; // shift start victim tid
1465             continue; // no chunks to steal, try next victim
1466           }
1467
1468           // stealing succeeded, reduce victim's ub by 1/4 of undone chunks
1469           // TODO: is this heuristic good enough?
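           /* Worked example (illustrative numbers, added commentary): if the
            * victim has count == 20 and ub == 100, then remaining == 80
            * chunks. Since 80 > 7, a quarter is taken from the tail:
            * v->u.p.ub drops to 80 and init == 80, so the thief executes
            * chunk 80 immediately and keeps chunks [81, 100) to hand out,
            * while the victim retains chunks [20, 80). With remaining <= 7
            * only the single tail chunk is taken. */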
1470           remaining = limit - v->u.p.count;
1471           if (remaining > 7) {
1472             // steal 1/4 of remaining
1473             KMP_COUNT_DEVELOPER_VALUE(FOR_static_steal_stolen, remaining >> 2);
1474             init = (v->u.p.ub -= (remaining >> 2));
1475           } else {
1476             // steal 1 chunk of 1..7 remaining
1477             KMP_COUNT_DEVELOPER_VALUE(FOR_static_steal_stolen, 1);
1478             init = (v->u.p.ub -= 1);
1479           }
1480           __kmp_release_lock(lckv, gtid);
1481#ifdef KMP_DEBUG
1482           {
1483             char *buff;
1484             // create format specifiers before the debug output
1485             buff = __kmp_str_format(
1486                 "__kmp_dispatch_next: T#%%d stolen chunks from T#%%d, "
1487                 "count:%%%s ub:%%%s\n",
1488                 traits_t<UT>::spec, traits_t<UT>::spec);
1489             KD_TRACE(10, (buff, gtid, victimId, init, limit));
1490             __kmp_str_free(&buff);
1491           }
1492#endif
1493           KMP_DEBUG_ASSERT(init + 1 <= limit);
1494           pr->u.p.parm4 = victimId; // remember victim to steal from
1495           status = 1;
1496           // now update own count and ub with stolen range excluding init chunk
1497           __kmp_acquire_lock(lck, gtid);
1498           pr->u.p.count = init + 1;
1499           pr->u.p.ub = limit;
1500           __kmp_release_lock(lck, gtid);
1501           // activate non-empty buffer and let others steal from us
1502           if (init + 1 < limit)
1503             KMP_ATOMIC_ST_REL(&pr->steal_flag, READY);
1504         } // while (search for victim)
1505       } // if (try to find victim and steal)
1506     } else {
1507       // 4-byte induction variable, use 8-byte CAS for pair (count, ub)
1508       // as all operations on pair (count, ub) must be done atomically
1509       typedef union {
1510         struct {
1511           UT count;
1512           T ub;
1513         } p;
1514         kmp_int64 b;
1515       } union_i4;
1516       union_i4 vold, vnew;
1517       if (pr->u.p.count < (UT)pr->u.p.ub) {
1518         KMP_DEBUG_ASSERT(pr->steal_flag == READY);
1519         vold.b = *(volatile kmp_int64 *)(&pr->u.p.count);
1520         vnew.b = vold.b;
1521         vnew.p.count++; // get chunk from head of self range
1522         while (!KMP_COMPARE_AND_STORE_REL64(
1523             (volatile kmp_int64 *)&pr->u.p.count,
1524             *VOLATILE_CAST(kmp_int64 *) & vold.b,
1525             *VOLATILE_CAST(kmp_int64 *) & vnew.b)) {
1526           KMP_CPU_PAUSE();
1527           vold.b = *(volatile kmp_int64 *)(&pr->u.p.count);
1528           vnew.b = vold.b;
1529           vnew.p.count++;
1530         }
1531         init = vold.p.count;
1532         status = (init < (UT)vold.p.ub);
1533       } else {
1534         status = 0; // no own chunks
1535       }
1536       if (!status) { // try to steal
1537         T while_limit = pr->u.p.parm3;
1538         T while_index = 0;
1539         int idx = (th->th.th_dispatch->th_disp_index - 1) %
1540                   __kmp_dispatch_num_buffers; // current loop index
1541         // note: victim thread can potentially execute another loop
1542         KMP_ATOMIC_ST_REL(&pr->steal_flag, THIEF); // mark self buffer inactive
1543         while ((!status) && (while_limit != ++while_index)) {
1544           dispatch_private_info_template<T> *v;
1545           T remaining;
1546           T victimId = pr->u.p.parm4;
1547           T oldVictimId = victimId ?
victimId - 1 : nproc - 1;
1548           v = reinterpret_cast<dispatch_private_info_template<T> *>(
1549               &team->t.t_dispatch[victimId].th_disp_buffer[idx]);
1550           KMP_DEBUG_ASSERT(v);
1551           while ((v == pr || KMP_ATOMIC_LD_RLX(&v->steal_flag) == THIEF) &&
1552                  oldVictimId != victimId) {
1553             victimId = (victimId + 1) % nproc;
1554             v = reinterpret_cast<dispatch_private_info_template<T> *>(
1555                 &team->t.t_dispatch[victimId].th_disp_buffer[idx]);
1556             KMP_DEBUG_ASSERT(v);
1557           }
1558           if (v == pr || KMP_ATOMIC_LD_RLX(&v->steal_flag) == THIEF) {
1559             continue; // try once more (nproc attempts in total)
1560           }
1561           if (KMP_ATOMIC_LD_RLX(&v->steal_flag) == UNUSED) {
1562             kmp_uint32 old = UNUSED;
1563             // try to steal whole range from inactive victim
1564             status = v->steal_flag.compare_exchange_strong(old, THIEF);
1565             if (status) {
1566               // initialize self buffer with victim's whole range of chunks
1567               T id = victimId;
1568               T small_chunk = 0, extras = 0, p_extra = 0;
1569               __kmp_initialize_self_buffer<T>(team, id, pr, nchunks, nproc,
1570                                               init, small_chunk, extras,
1571                                               p_extra);
1572               vnew.p.count = init + 1;
1573               vnew.p.ub = init + small_chunk + p_extra + (id < extras ? 1 : 0);
1574               // write pair (count, ub) at once atomically
1575#if KMP_ARCH_X86
1576               KMP_XCHG_FIXED64((volatile kmp_int64 *)(&pr->u.p.count), vnew.b);
1577#else
1578               *(volatile kmp_int64 *)(&pr->u.p.count) = vnew.b;
1579#endif
1580               pr->u.p.parm4 = (id + 1) % nproc; // remember neighbour tid
1581               // no need to initialize other thread invariants: lb, st, etc.
1582#ifdef KMP_DEBUG
1583               {
1584                 char *buff;
1585                 // create format specifiers before the debug output
1586                 buff = __kmp_str_format("__kmp_dispatch_next_algorithm: T#%%d "
1587                                         "stolen chunks from T#%%d, "
1588                                         "count:%%%s ub:%%%s\n",
1589                                         traits_t<UT>::spec, traits_t<T>::spec);
1590                 KD_TRACE(10, (buff, gtid, id, pr->u.p.count, pr->u.p.ub));
1591                 __kmp_str_free(&buff);
1592               }
1593#endif
1594               // activate non-empty buffer and let others steal from us
1595               if (pr->u.p.count < (UT)pr->u.p.ub)
1596                 KMP_ATOMIC_ST_REL(&pr->steal_flag, READY);
1597               break;
1598             }
1599           }
1600           while (1) { // CAS loop with check if victim still has enough chunks
1601             // many threads may be stealing concurrently from same victim
1602             vold.b = *(volatile kmp_int64 *)(&v->u.p.count);
1603             if (KMP_ATOMIC_LD_ACQ(&v->steal_flag) != READY ||
1604                 vold.p.count >= (UT)vold.p.ub) {
1605               pr->u.p.parm4 = (victimId + 1) % nproc; // shift start victim id
1606               break; // no chunks to steal, try next victim
1607             }
1608             vnew.b = vold.b;
1609             remaining = vold.p.ub - vold.p.count;
1610             // try to steal 1/4 of remaining
1611             // TODO: is this heuristic good enough?
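             /* Note (added commentary): in this 4-byte branch, count and ub
              * are adjacent 4-byte fields read and written as one 8-byte word
              * (union_i4 above), so an owner taking a chunk from the head
              * (count++) and a thief trimming the tail (ub -= k) both go
              * through a single 64-bit CAS; whichever update loses the race
              * is simply retried against the freshly reloaded value. */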
1612             if (remaining > 7) {
1613               vnew.p.ub -= remaining >> 2; // steal from tail of victim's range
1614             } else {
1615               vnew.p.ub -= 1; // steal 1 chunk of 1..7 remaining
1616             }
1617             KMP_DEBUG_ASSERT(vnew.p.ub * (UT)chunk <= trip);
1618             if (KMP_COMPARE_AND_STORE_REL64(
1619                     (volatile kmp_int64 *)&v->u.p.count,
1620                     *VOLATILE_CAST(kmp_int64 *) & vold.b,
1621                     *VOLATILE_CAST(kmp_int64 *) & vnew.b)) {
1622               // stealing succeeded
1623#ifdef KMP_DEBUG
1624               {
1625                 char *buff;
1626                 // create format specifiers before the debug output
1627                 buff = __kmp_str_format(
1628                     "__kmp_dispatch_next: T#%%d stolen chunks from T#%%d, "
1629                     "count:%%%s ub:%%%s\n",
1630                     traits_t<T>::spec, traits_t<T>::spec);
1631                 KD_TRACE(10, (buff, gtid, victimId, vnew.p.ub, vold.p.ub));
1632                 __kmp_str_free(&buff);
1633               }
1634#endif
1635               KMP_COUNT_DEVELOPER_VALUE(FOR_static_steal_stolen,
1636                                         vold.p.ub - vnew.p.ub);
1637               status = 1;
1638               pr->u.p.parm4 = victimId; // keep victim id
1639               // now update own count and ub
1640               init = vnew.p.ub;
1641               vold.p.count = init + 1;
1642#if KMP_ARCH_X86
1643               KMP_XCHG_FIXED64((volatile kmp_int64 *)(&pr->u.p.count), vold.b);
1644#else
1645               *(volatile kmp_int64 *)(&pr->u.p.count) = vold.b;
1646#endif
1647               // activate non-empty buffer and let others steal from us
1648               if (vold.p.count < (UT)vold.p.ub)
1649                 KMP_ATOMIC_ST_REL(&pr->steal_flag, READY);
1650               break;
1651             } // if (check CAS result)
1652             KMP_CPU_PAUSE(); // CAS failed, repeat the attempt
1653           } // while (try to steal from particular victim)
1654         } // while (search for victim)
1655       } // if (try to find victim and steal)
1656     } // if (4-byte induction variable)
1657     if (!status) {
1658       *p_lb = 0;
1659       *p_ub = 0;
1660       if (p_st != NULL)
1661         *p_st = 0;
1662     } else {
1663       start = pr->u.p.lb;
1664       init *= chunk;
1665       limit = chunk + init - 1;
1666       incr = pr->u.p.st;
1667       KMP_COUNT_DEVELOPER_VALUE(FOR_static_steal_chunks, 1);
1668
1669       KMP_DEBUG_ASSERT(init <= trip);
1670       // keep track of done chunks for possible early exit from stealing
1671       // TODO: count executed chunks locally with rare update of shared location
1672       // test_then_inc<ST>((volatile ST *)&sh->u.s.iteration);
1673       if ((last = (limit >= trip)) != 0)
1674         limit = trip;
1675       if (p_st != NULL)
1676         *p_st = incr;
1677
1678       if (incr == 1) {
1679         *p_lb = start + init;
1680         *p_ub = start + limit;
1681       } else {
1682         *p_lb = start + init * incr;
1683         *p_ub = start + limit * incr;
1684       }
1685     } // if
1686     break;
1687   } // case
1688#endif // KMP_STATIC_STEAL_ENABLED
1689   case kmp_sch_static_balanced: {
1690     KD_TRACE(
1691         10,
1692         ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_static_balanced case\n",
1693          gtid));
1694     /* check if thread has any iteration to do */
1695     if ((status = !pr->u.p.count) != 0) {
1696       pr->u.p.count = 1;
1697       *p_lb = pr->u.p.lb;
1698       *p_ub = pr->u.p.ub;
1699       last = (pr->u.p.parm1 != 0);
1700       if (p_st != NULL)
1701         *p_st = pr->u.p.st;
1702     } else { /* no iterations to do */
1703       pr->u.p.lb = pr->u.p.ub + pr->u.p.st;
1704     }
1705   } // case
1706   break;
1707   case kmp_sch_static_greedy: /* original code for kmp_sch_static_greedy was
1708                                  merged here */
1709   case kmp_sch_static_chunked: {
1710     T parm1;
1711
1712     KD_TRACE(100, ("__kmp_dispatch_next_algorithm: T#%d "
1713                    "kmp_sch_static_[affinity|chunked] case\n",
1714                    gtid));
1715     parm1 = pr->u.p.parm1;
1716
1717     trip = pr->u.p.tc - 1;
1718     init = parm1 * (pr->u.p.count + tid);
1719
1720     if ((status = (init <= trip)) != 0) {
1721       start = pr->u.p.lb;
1722       incr = pr->u.p.st;
1723       limit = parm1 + init - 1;
1724
1725       if ((last =
(limit >= trip)) != 0) 1726 limit = trip; 1727 1728 if (p_st != NULL) 1729 *p_st = incr; 1730 1731 pr->u.p.count += nproc; 1732 1733 if (incr == 1) { 1734 *p_lb = start + init; 1735 *p_ub = start + limit; 1736 } else { 1737 *p_lb = start + init * incr; 1738 *p_ub = start + limit * incr; 1739 } 1740 1741 if (pr->flags.ordered) { 1742 pr->u.p.ordered_lower = init; 1743 pr->u.p.ordered_upper = limit; 1744 } // if 1745 } // if 1746 } // case 1747 break; 1748 1749 case kmp_sch_dynamic_chunked: { 1750 UT chunk_number; 1751 UT chunk_size = pr->u.p.parm1; 1752 UT nchunks = pr->u.p.parm2; 1753 1754 KD_TRACE( 1755 100, 1756 ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_dynamic_chunked case\n", 1757 gtid)); 1758 1759 chunk_number = test_then_inc_acq<ST>((volatile ST *)&sh->u.s.iteration); 1760 status = (chunk_number < nchunks); 1761 if (!status) { 1762 *p_lb = 0; 1763 *p_ub = 0; 1764 if (p_st != NULL) 1765 *p_st = 0; 1766 } else { 1767 init = chunk_size * chunk_number; 1768 trip = pr->u.p.tc - 1; 1769 start = pr->u.p.lb; 1770 incr = pr->u.p.st; 1771 1772 if ((last = (trip - init < (UT)chunk_size))) 1773 limit = trip; 1774 else 1775 limit = chunk_size + init - 1; 1776 1777 if (p_st != NULL) 1778 *p_st = incr; 1779 1780 if (incr == 1) { 1781 *p_lb = start + init; 1782 *p_ub = start + limit; 1783 } else { 1784 *p_lb = start + init * incr; 1785 *p_ub = start + limit * incr; 1786 } 1787 1788 if (pr->flags.ordered) { 1789 pr->u.p.ordered_lower = init; 1790 pr->u.p.ordered_upper = limit; 1791 } // if 1792 } // if 1793 } // case 1794 break; 1795 1796 case kmp_sch_guided_iterative_chunked: { 1797 T chunkspec = pr->u.p.parm1; 1798 KD_TRACE(100, ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_guided_chunked " 1799 "iterative case\n", 1800 gtid)); 1801 trip = pr->u.p.tc; 1802 // Start atomic part of calculations 1803 while (1) { 1804 ST remaining; // signed, because can be < 0 1805 init = sh->u.s.iteration; // shared value 1806 remaining = trip - init; 1807 if (remaining <= 0) { // AC: need to compare with 0 first 1808 // nothing to do, don't try atomic op 1809 status = 0; 1810 break; 1811 } 1812 if ((T)remaining < 1813 pr->u.p.parm2) { // compare with K*nproc*(chunk+1), K=2 by default 1814 // use dynamic-style schedule 1815 // atomically increment iterations, get old value 1816 init = test_then_add<ST>(RCAST(volatile ST *, &sh->u.s.iteration), 1817 (ST)chunkspec); 1818 remaining = trip - init; 1819 if (remaining <= 0) { 1820 status = 0; // all iterations got by other threads 1821 } else { 1822 // got some iterations to work on 1823 status = 1; 1824 if ((T)remaining > chunkspec) { 1825 limit = init + chunkspec - 1; 1826 } else { 1827 last = true; // the last chunk 1828 limit = init + remaining - 1; 1829 } // if 1830 } // if 1831 break; 1832 } // if 1833 limit = init + (UT)((double)remaining * 1834 *(double *)&pr->u.p.parm3); // divide by K*nproc 1835 if (compare_and_swap<ST>(RCAST(volatile ST *, &sh->u.s.iteration), 1836 (ST)init, (ST)limit)) { 1837 // CAS was successful, chunk obtained 1838 status = 1; 1839 --limit; 1840 break; 1841 } // if 1842 } // while 1843 if (status != 0) { 1844 start = pr->u.p.lb; 1845 incr = pr->u.p.st; 1846 if (p_st != NULL) 1847 *p_st = incr; 1848 *p_lb = start + init * incr; 1849 *p_ub = start + limit * incr; 1850 if (pr->flags.ordered) { 1851 pr->u.p.ordered_lower = init; 1852 pr->u.p.ordered_upper = limit; 1853 } // if 1854 } else { 1855 *p_lb = 0; 1856 *p_ub = 0; 1857 if (p_st != NULL) 1858 *p_st = 0; 1859 } // if 1860 } // case 1861 break; 1862 1863 case kmp_sch_guided_simd: { 1864 
// same as iterative but curr-chunk adjusted to be multiple of given 1865 // chunk 1866 T chunk = pr->u.p.parm1; 1867 KD_TRACE(100, 1868 ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_guided_simd case\n", 1869 gtid)); 1870 trip = pr->u.p.tc; 1871 // Start atomic part of calculations 1872 while (1) { 1873 ST remaining; // signed, because can be < 0 1874 init = sh->u.s.iteration; // shared value 1875 remaining = trip - init; 1876 if (remaining <= 0) { // AC: need to compare with 0 first 1877 status = 0; // nothing to do, don't try atomic op 1878 break; 1879 } 1880 KMP_DEBUG_ASSERT(chunk && init % chunk == 0); 1881 // compare with K*nproc*(chunk+1), K=2 by default 1882 if ((T)remaining < pr->u.p.parm2) { 1883 // use dynamic-style schedule 1884 // atomically increment iterations, get old value 1885 init = test_then_add<ST>(RCAST(volatile ST *, &sh->u.s.iteration), 1886 (ST)chunk); 1887 remaining = trip - init; 1888 if (remaining <= 0) { 1889 status = 0; // all iterations got by other threads 1890 } else { 1891 // got some iterations to work on 1892 status = 1; 1893 if ((T)remaining > chunk) { 1894 limit = init + chunk - 1; 1895 } else { 1896 last = true; // the last chunk 1897 limit = init + remaining - 1; 1898 } // if 1899 } // if 1900 break; 1901 } // if 1902 // divide by K*nproc 1903 UT span; 1904 __kmp_type_convert((double)remaining * (*(double *)&pr->u.p.parm3), 1905 &span); 1906 UT rem = span % chunk; 1907 if (rem) // adjust so that span%chunk == 0 1908 span += chunk - rem; 1909 limit = init + span; 1910 if (compare_and_swap<ST>(RCAST(volatile ST *, &sh->u.s.iteration), 1911 (ST)init, (ST)limit)) { 1912 // CAS was successful, chunk obtained 1913 status = 1; 1914 --limit; 1915 break; 1916 } // if 1917 } // while 1918 if (status != 0) { 1919 start = pr->u.p.lb; 1920 incr = pr->u.p.st; 1921 if (p_st != NULL) 1922 *p_st = incr; 1923 *p_lb = start + init * incr; 1924 *p_ub = start + limit * incr; 1925 if (pr->flags.ordered) { 1926 pr->u.p.ordered_lower = init; 1927 pr->u.p.ordered_upper = limit; 1928 } // if 1929 } else { 1930 *p_lb = 0; 1931 *p_ub = 0; 1932 if (p_st != NULL) 1933 *p_st = 0; 1934 } // if 1935 } // case 1936 break; 1937 1938 case kmp_sch_guided_analytical_chunked: { 1939 T chunkspec = pr->u.p.parm1; 1940 UT chunkIdx; 1941#if KMP_USE_X87CONTROL 1942 /* for storing original FPCW value for Windows* OS on 1943 IA-32 architecture 8-byte version */ 1944 unsigned int oldFpcw; 1945 unsigned int fpcwSet = 0; 1946#endif 1947 KD_TRACE(100, ("__kmp_dispatch_next_algorithm: T#%d " 1948 "kmp_sch_guided_analytical_chunked case\n", 1949 gtid)); 1950 1951 trip = pr->u.p.tc; 1952 1953 KMP_DEBUG_ASSERT(nproc > 1); 1954 KMP_DEBUG_ASSERT((2UL * chunkspec + 1) * (UT)nproc < trip); 1955 1956 while (1) { /* this while loop is a safeguard against unexpected zero 1957 chunk sizes */ 1958 chunkIdx = test_then_inc_acq<ST>((volatile ST *)&sh->u.s.iteration); 1959 if (chunkIdx >= (UT)pr->u.p.parm2) { 1960 --trip; 1961 /* use dynamic-style scheduling */ 1962 init = chunkIdx * chunkspec + pr->u.p.count; 1963 /* need to verify init > 0 in case of overflow in the above 1964 * calculation */ 1965 if ((status = (init > 0 && init <= trip)) != 0) { 1966 limit = init + chunkspec - 1; 1967 1968 if ((last = (limit >= trip)) != 0) 1969 limit = trip; 1970 } 1971 break; 1972 } else { 1973/* use exponential-style scheduling */ 1974/* The following check is to workaround the lack of long double precision on 1975 Windows* OS. 1976 This check works around the possible effect that init != 0 for chunkIdx == 0. 
1977 */ 1978#if KMP_USE_X87CONTROL 1979 /* If we haven't already done so, save original 1980 FPCW and set precision to 64-bit, as Windows* OS 1981 on IA-32 architecture defaults to 53-bit */ 1982 if (!fpcwSet) { 1983 oldFpcw = _control87(0, 0); 1984 _control87(_PC_64, _MCW_PC); 1985 fpcwSet = 0x30000; 1986 } 1987#endif 1988 if (chunkIdx) { 1989 init = __kmp_dispatch_guided_remaining<T>( 1990 trip, *(DBL *)&pr->u.p.parm3, chunkIdx); 1991 KMP_DEBUG_ASSERT(init); 1992 init = trip - init; 1993 } else 1994 init = 0; 1995 limit = trip - __kmp_dispatch_guided_remaining<T>( 1996 trip, *(DBL *)&pr->u.p.parm3, chunkIdx + 1); 1997 KMP_ASSERT(init <= limit); 1998 if (init < limit) { 1999 KMP_DEBUG_ASSERT(limit <= trip); 2000 --limit; 2001 status = 1; 2002 break; 2003 } // if 2004 } // if 2005 } // while (1) 2006#if KMP_USE_X87CONTROL 2007 /* restore FPCW if necessary 2008 AC: check fpcwSet flag first because oldFpcw can be uninitialized here 2009 */ 2010 if (fpcwSet && (oldFpcw & fpcwSet)) 2011 _control87(oldFpcw, _MCW_PC); 2012#endif 2013 if (status != 0) { 2014 start = pr->u.p.lb; 2015 incr = pr->u.p.st; 2016 if (p_st != NULL) 2017 *p_st = incr; 2018 *p_lb = start + init * incr; 2019 *p_ub = start + limit * incr; 2020 if (pr->flags.ordered) { 2021 pr->u.p.ordered_lower = init; 2022 pr->u.p.ordered_upper = limit; 2023 } 2024 } else { 2025 *p_lb = 0; 2026 *p_ub = 0; 2027 if (p_st != NULL) 2028 *p_st = 0; 2029 } 2030 } // case 2031 break; 2032 2033 case kmp_sch_trapezoidal: { 2034 UT index; 2035 T parm2 = pr->u.p.parm2; 2036 T parm3 = pr->u.p.parm3; 2037 T parm4 = pr->u.p.parm4; 2038 KD_TRACE(100, 2039 ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_trapezoidal case\n", 2040 gtid)); 2041 2042 index = test_then_inc<ST>((volatile ST *)&sh->u.s.iteration); 2043 2044 init = (index * ((2 * parm2) - (index - 1) * parm4)) / 2; 2045 trip = pr->u.p.tc - 1; 2046 2047 if ((status = ((T)index < parm3 && init <= trip)) == 0) { 2048 *p_lb = 0; 2049 *p_ub = 0; 2050 if (p_st != NULL) 2051 *p_st = 0; 2052 } else { 2053 start = pr->u.p.lb; 2054 limit = ((index + 1) * (2 * parm2 - index * parm4)) / 2 - 1; 2055 incr = pr->u.p.st; 2056 2057 if ((last = (limit >= trip)) != 0) 2058 limit = trip; 2059 2060 if (p_st != NULL) 2061 *p_st = incr; 2062 2063 if (incr == 1) { 2064 *p_lb = start + init; 2065 *p_ub = start + limit; 2066 } else { 2067 *p_lb = start + init * incr; 2068 *p_ub = start + limit * incr; 2069 } 2070 2071 if (pr->flags.ordered) { 2072 pr->u.p.ordered_lower = init; 2073 pr->u.p.ordered_upper = limit; 2074 } // if 2075 } // if 2076 } // case 2077 break; 2078 default: { 2079 status = 0; // to avoid complaints on uninitialized variable use 2080 __kmp_fatal(KMP_MSG(UnknownSchedTypeDetected), // Primary message 2081 KMP_HNT(GetNewerLibrary), // Hint 2082 __kmp_msg_null // Variadic argument list terminator 2083 ); 2084 } break; 2085 } // switch 2086 if (p_last) 2087 *p_last = last; 2088#ifdef KMP_DEBUG 2089 if (pr->flags.ordered) { 2090 char *buff; 2091 // create format specifiers before the debug output 2092 buff = __kmp_str_format("__kmp_dispatch_next_algorithm: T#%%d " 2093 "ordered_lower:%%%s ordered_upper:%%%s\n", 2094 traits_t<UT>::spec, traits_t<UT>::spec); 2095 KD_TRACE(1000, (buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper)); 2096 __kmp_str_free(&buff); 2097 } 2098 { 2099 char *buff; 2100 // create format specifiers before the debug output 2101 buff = __kmp_str_format( 2102 "__kmp_dispatch_next_algorithm: T#%%d exit status:%%d p_last:%%d " 2103 "p_lb:%%%s p_ub:%%%s p_st:%%%s\n", 2104 
traits_t<T>::spec, traits_t<T>::spec, traits_t<ST>::spec); 2105 KMP_DEBUG_ASSERT(p_last); 2106 KMP_DEBUG_ASSERT(p_st); 2107 KD_TRACE(10, (buff, gtid, status, *p_last, *p_lb, *p_ub, *p_st)); 2108 __kmp_str_free(&buff); 2109 } 2110#endif 2111 return status; 2112} 2113 2114/* Define a macro for exiting __kmp_dispatch_next(). If status is 0 (no more 2115 work), then tell OMPT the loop is over. In some cases kmp_dispatch_fini() 2116 is not called. */ 2117#if OMPT_SUPPORT && OMPT_OPTIONAL 2118#define OMPT_LOOP_END \ 2119 if (status == 0) { \ 2120 if (ompt_enabled.ompt_callback_work) { \ 2121 ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL); \ 2122 ompt_task_info_t *task_info = __ompt_get_task_info_object(0); \ 2123 ompt_callbacks.ompt_callback(ompt_callback_work)( \ 2124 ompt_work_loop, ompt_scope_end, &(team_info->parallel_data), \ 2125 &(task_info->task_data), 0, codeptr); \ 2126 } \ 2127 } 2128#define OMPT_LOOP_DISPATCH(lb, ub, st, status) \ 2129 if (ompt_enabled.ompt_callback_dispatch && status) { \ 2130 ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL); \ 2131 ompt_task_info_t *task_info = __ompt_get_task_info_object(0); \ 2132 ompt_dispatch_chunk_t chunk; \ 2133 ompt_data_t instance = ompt_data_none; \ 2134 OMPT_GET_DISPATCH_CHUNK(chunk, lb, ub, st); \ 2135 instance.ptr = &chunk; \ 2136 ompt_callbacks.ompt_callback(ompt_callback_dispatch)( \ 2137 &(team_info->parallel_data), &(task_info->task_data), \ 2138 ompt_dispatch_ws_loop_chunk, instance); \ 2139 } 2140// TODO: implement count 2141#else 2142#define OMPT_LOOP_END // no-op 2143#define OMPT_LOOP_DISPATCH(lb, ub, st, status) // no-op 2144#endif 2145 2146#if KMP_STATS_ENABLED 2147#define KMP_STATS_LOOP_END \ 2148 { \ 2149 kmp_int64 u, l, t, i; \ 2150 l = (kmp_int64)(*p_lb); \ 2151 u = (kmp_int64)(*p_ub); \ 2152 i = (kmp_int64)(pr->u.p.st); \ 2153 if (status == 0) { \ 2154 t = 0; \ 2155 KMP_POP_PARTITIONED_TIMER(); \ 2156 } else if (i == 1) { \ 2157 if (u >= l) \ 2158 t = u - l + 1; \ 2159 else \ 2160 t = 0; \ 2161 } else if (i < 0) { \ 2162 if (l >= u) \ 2163 t = (l - u) / (-i) + 1; \ 2164 else \ 2165 t = 0; \ 2166 } else { \ 2167 if (u >= l) \ 2168 t = (u - l) / i + 1; \ 2169 else \ 2170 t = 0; \ 2171 } \ 2172 KMP_COUNT_VALUE(OMP_loop_dynamic_iterations, t); \ 2173 } 2174#else 2175#define KMP_STATS_LOOP_END /* Nothing */ 2176#endif 2177 2178template <typename T> 2179static int __kmp_dispatch_next(ident_t *loc, int gtid, kmp_int32 *p_last, 2180 T *p_lb, T *p_ub, 2181 typename traits_t<T>::signed_t *p_st 2182#if OMPT_SUPPORT && OMPT_OPTIONAL 2183 , 2184 void *codeptr 2185#endif 2186) { 2187 2188 typedef typename traits_t<T>::unsigned_t UT; 2189 typedef typename traits_t<T>::signed_t ST; 2190 // This is potentially slightly misleading, schedule(runtime) will appear here 2191 // even if the actual runtime schedule is static. (Which points out a 2192 // disadvantage of schedule(runtime): even when static scheduling is used it 2193 // costs more than a compile time choice to use static scheduling would.) 
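  // For example (user code, added for illustration, not part of the runtime):
  //     #pragma omp parallel for schedule(runtime)
  //     for (int i = 0; i < n; ++i) body(i);
  // reaches this dispatcher chunk by chunk even when OMP_SCHEDULE=static,
  // whereas schedule(static) written in the source is typically lowered to a
  // single static division of the iteration space with no per-chunk calls.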
2194 KMP_TIME_PARTITIONED_BLOCK(OMP_loop_dynamic_scheduling); 2195 2196 int status; 2197 dispatch_private_info_template<T> *pr; 2198 __kmp_assert_valid_gtid(gtid); 2199 kmp_info_t *th = __kmp_threads[gtid]; 2200 kmp_team_t *team = th->th.th_team; 2201 2202 KMP_DEBUG_ASSERT(p_lb && p_ub && p_st); // AC: these cannot be NULL 2203 KD_TRACE( 2204 1000, 2205 ("__kmp_dispatch_next: T#%d called p_lb:%p p_ub:%p p_st:%p p_last: %p\n", 2206 gtid, p_lb, p_ub, p_st, p_last)); 2207 2208 if (team->t.t_serialized) { 2209 /* NOTE: serialize this dispatch because we are not at the active level */ 2210 pr = reinterpret_cast<dispatch_private_info_template<T> *>( 2211 th->th.th_dispatch->th_disp_buffer); /* top of the stack */ 2212 KMP_DEBUG_ASSERT(pr); 2213 2214 if ((status = (pr->u.p.tc != 0)) == 0) { 2215 *p_lb = 0; 2216 *p_ub = 0; 2217 // if ( p_last != NULL ) 2218 // *p_last = 0; 2219 if (p_st != NULL) 2220 *p_st = 0; 2221 if (__kmp_env_consistency_check) { 2222 if (pr->pushed_ws != ct_none) { 2223 pr->pushed_ws = __kmp_pop_workshare(gtid, pr->pushed_ws, loc); 2224 } 2225 } 2226 } else if (pr->flags.nomerge) { 2227 kmp_int32 last; 2228 T start; 2229 UT limit, trip, init; 2230 ST incr; 2231 T chunk = pr->u.p.parm1; 2232 2233 KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_dynamic_chunked case\n", 2234 gtid)); 2235 2236 init = chunk * pr->u.p.count++; 2237 trip = pr->u.p.tc - 1; 2238 2239 if ((status = (init <= trip)) == 0) { 2240 *p_lb = 0; 2241 *p_ub = 0; 2242 // if ( p_last != NULL ) 2243 // *p_last = 0; 2244 if (p_st != NULL) 2245 *p_st = 0; 2246 if (__kmp_env_consistency_check) { 2247 if (pr->pushed_ws != ct_none) { 2248 pr->pushed_ws = __kmp_pop_workshare(gtid, pr->pushed_ws, loc); 2249 } 2250 } 2251 } else { 2252 start = pr->u.p.lb; 2253 limit = chunk + init - 1; 2254 incr = pr->u.p.st; 2255 2256 if ((last = (limit >= trip)) != 0) { 2257 limit = trip; 2258#if KMP_OS_WINDOWS 2259 pr->u.p.last_upper = pr->u.p.ub; 2260#endif /* KMP_OS_WINDOWS */ 2261 } 2262 if (p_last != NULL) 2263 *p_last = last; 2264 if (p_st != NULL) 2265 *p_st = incr; 2266 if (incr == 1) { 2267 *p_lb = start + init; 2268 *p_ub = start + limit; 2269 } else { 2270 *p_lb = start + init * incr; 2271 *p_ub = start + limit * incr; 2272 } 2273 2274 if (pr->flags.ordered) { 2275 pr->u.p.ordered_lower = init; 2276 pr->u.p.ordered_upper = limit; 2277#ifdef KMP_DEBUG 2278 { 2279 char *buff; 2280 // create format specifiers before the debug output 2281 buff = __kmp_str_format("__kmp_dispatch_next: T#%%d " 2282 "ordered_lower:%%%s ordered_upper:%%%s\n", 2283 traits_t<UT>::spec, traits_t<UT>::spec); 2284 KD_TRACE(1000, (buff, gtid, pr->u.p.ordered_lower, 2285 pr->u.p.ordered_upper)); 2286 __kmp_str_free(&buff); 2287 } 2288#endif 2289 } // if 2290 } // if 2291 } else { 2292 pr->u.p.tc = 0; 2293 *p_lb = pr->u.p.lb; 2294 *p_ub = pr->u.p.ub; 2295#if KMP_OS_WINDOWS 2296 pr->u.p.last_upper = *p_ub; 2297#endif /* KMP_OS_WINDOWS */ 2298 if (p_last != NULL) 2299 *p_last = TRUE; 2300 if (p_st != NULL) 2301 *p_st = pr->u.p.st; 2302 } // if 2303#ifdef KMP_DEBUG 2304 { 2305 char *buff; 2306 // create format specifiers before the debug output 2307 buff = __kmp_str_format( 2308 "__kmp_dispatch_next: T#%%d serialized case: p_lb:%%%s " 2309 "p_ub:%%%s p_st:%%%s p_last:%%p %%d returning:%%d\n", 2310 traits_t<T>::spec, traits_t<T>::spec, traits_t<ST>::spec); 2311 KD_TRACE(10, (buff, gtid, *p_lb, *p_ub, *p_st, p_last, 2312 (p_last ? 
*p_last : 0), status)); 2313 __kmp_str_free(&buff); 2314 } 2315#endif 2316#if INCLUDE_SSC_MARKS 2317 SSC_MARK_DISPATCH_NEXT(); 2318#endif 2319 OMPT_LOOP_DISPATCH(*p_lb, *p_ub, pr->u.p.st, status); 2320 OMPT_LOOP_END; 2321 KMP_STATS_LOOP_END; 2322 return status; 2323 } else { 2324 kmp_int32 last = 0; 2325 dispatch_shared_info_template<T> volatile *sh; 2326 2327 KMP_DEBUG_ASSERT(th->th.th_dispatch == 2328 &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]); 2329 2330 pr = reinterpret_cast<dispatch_private_info_template<T> *>( 2331 th->th.th_dispatch->th_dispatch_pr_current); 2332 KMP_DEBUG_ASSERT(pr); 2333 sh = reinterpret_cast<dispatch_shared_info_template<T> volatile *>( 2334 th->th.th_dispatch->th_dispatch_sh_current); 2335 KMP_DEBUG_ASSERT(sh); 2336 2337#if KMP_USE_HIER_SCHED 2338 if (pr->flags.use_hier) 2339 status = sh->hier->next(loc, gtid, pr, &last, p_lb, p_ub, p_st); 2340 else 2341#endif // KMP_USE_HIER_SCHED 2342 status = __kmp_dispatch_next_algorithm<T>(gtid, pr, sh, &last, p_lb, p_ub, 2343 p_st, th->th.th_team_nproc, 2344 th->th.th_info.ds.ds_tid); 2345 // status == 0: no more iterations to execute 2346 if (status == 0) { 2347 ST num_done; 2348 num_done = test_then_inc<ST>(&sh->u.s.num_done); 2349#ifdef KMP_DEBUG 2350 { 2351 char *buff; 2352 // create format specifiers before the debug output 2353 buff = __kmp_str_format( 2354 "__kmp_dispatch_next: T#%%d increment num_done:%%%s\n", 2355 traits_t<ST>::spec); 2356 KD_TRACE(10, (buff, gtid, sh->u.s.num_done)); 2357 __kmp_str_free(&buff); 2358 } 2359#endif 2360 2361#if KMP_USE_HIER_SCHED 2362 pr->flags.use_hier = FALSE; 2363#endif 2364 if (num_done == th->th.th_team_nproc - 1) { 2365#if KMP_STATIC_STEAL_ENABLED 2366 if (pr->schedule == kmp_sch_static_steal) { 2367 int i; 2368 int idx = (th->th.th_dispatch->th_disp_index - 1) % 2369 __kmp_dispatch_num_buffers; // current loop index 2370 // loop complete, safe to destroy locks used for stealing 2371 for (i = 0; i < th->th.th_team_nproc; ++i) { 2372 dispatch_private_info_template<T> *buf = 2373 reinterpret_cast<dispatch_private_info_template<T> *>( 2374 &team->t.t_dispatch[i].th_disp_buffer[idx]); 2375 KMP_ASSERT(buf->steal_flag == THIEF); // buffer must be inactive 2376 KMP_ATOMIC_ST_RLX(&buf->steal_flag, UNUSED); 2377 if (traits_t<T>::type_size > 4) { 2378 // destroy locks used for stealing 2379 kmp_lock_t *lck = buf->u.p.steal_lock; 2380 KMP_ASSERT(lck != NULL); 2381 __kmp_destroy_lock(lck); 2382 __kmp_free(lck); 2383 buf->u.p.steal_lock = NULL; 2384 } 2385 } 2386 } 2387#endif 2388 /* NOTE: release shared buffer to be reused */ 2389 2390 KMP_MB(); /* Flush all pending memory write invalidates. */ 2391 2392 sh->u.s.num_done = 0; 2393 sh->u.s.iteration = 0; 2394 2395 /* TODO replace with general release procedure? */ 2396 if (pr->flags.ordered) { 2397 sh->u.s.ordered_iteration = 0; 2398 } 2399 2400 sh->buffer_index += __kmp_dispatch_num_buffers; 2401 KD_TRACE(100, ("__kmp_dispatch_next: T#%d change buffer_index:%d\n", 2402 gtid, sh->buffer_index)); 2403 2404 KMP_MB(); /* Flush all pending memory write invalidates. 
 */
2405
2406   } // if
2407   if (__kmp_env_consistency_check) {
2408     if (pr->pushed_ws != ct_none) {
2409       pr->pushed_ws = __kmp_pop_workshare(gtid, pr->pushed_ws, loc);
2410     }
2411   }
2412
2413   th->th.th_dispatch->th_deo_fcn = NULL;
2414   th->th.th_dispatch->th_dxo_fcn = NULL;
2415   th->th.th_dispatch->th_dispatch_sh_current = NULL;
2416   th->th.th_dispatch->th_dispatch_pr_current = NULL;
2417   } // if (status == 0)
2418#if KMP_OS_WINDOWS
2419   else if (last) {
2420     pr->u.p.last_upper = pr->u.p.ub;
2421   }
2422#endif /* KMP_OS_WINDOWS */
2423   if (p_last != NULL && status != 0)
2424     *p_last = last;
2425   } // if
2426
2427#ifdef KMP_DEBUG
2428   {
2429     char *buff;
2430     // create format specifiers before the debug output
2431     buff = __kmp_str_format(
2432         "__kmp_dispatch_next: T#%%d normal case: "
2433         "p_lb:%%%s p_ub:%%%s p_st:%%%s p_last:%%p (%%d) returning:%%d\n",
2434         traits_t<T>::spec, traits_t<T>::spec, traits_t<ST>::spec);
2435     KD_TRACE(10, (buff, gtid, *p_lb, *p_ub, p_st ? *p_st : 0, p_last,
2436                   (p_last ? *p_last : 0), status));
2437     __kmp_str_free(&buff);
2438   }
2439#endif
2440#if INCLUDE_SSC_MARKS
2441   SSC_MARK_DISPATCH_NEXT();
2442#endif
2443   OMPT_LOOP_DISPATCH(*p_lb, *p_ub, pr->u.p.st, status);
2444   OMPT_LOOP_END;
2445   KMP_STATS_LOOP_END;
2446   return status;
2447}
2448
2449/*!
2450@ingroup WORK_SHARING
2451@param loc source location information
2452@param global_tid global thread number
2453@return Zero if the parallel region is not active and this thread should execute
2454all sections, non-zero otherwise.
2455
2456Beginning of sections construct.
2457There are no implicit barriers in the "sections" calls; rather, the compiler
2458should introduce an explicit barrier if it is required.
2459
2460This implementation is based on __kmp_dispatch_init, using the same constructs
2461for shared data (sections cannot be nested directly in an omp for loop; there
2462must be a parallel region in between)
2463*/
2464kmp_int32 __kmpc_sections_init(ident_t *loc, kmp_int32 gtid) {
2465
2466   int active;
2467   kmp_info_t *th;
2468   kmp_team_t *team;
2469   kmp_uint32 my_buffer_index;
2470   dispatch_shared_info_template<kmp_int32> volatile *sh;
2471
2472   KMP_DEBUG_ASSERT(__kmp_init_serial);
2473
2474   if (!TCR_4(__kmp_init_parallel))
2475     __kmp_parallel_initialize();
2476   __kmp_resume_if_soft_paused();
2477
2478   /* setup data */
2479   th = __kmp_threads[gtid];
2480   team = th->th.th_team;
2481   active = !team->t.t_serialized;
2482   th->th.th_ident = loc;
2483
2484   KMP_COUNT_BLOCK(OMP_SECTIONS);
2485   KD_TRACE(10, ("__kmpc_sections: called by T#%d\n", gtid));
2486
2487   if (active) {
2488     // Setup sections in the same way as dynamic scheduled loops.
2489     // We need one piece of shared data: which section to execute next.
2490 // (in case parallel is not active, all sections will be executed on the 2491 // same thread) 2492 KMP_DEBUG_ASSERT(th->th.th_dispatch == 2493 &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]); 2494 2495 my_buffer_index = th->th.th_dispatch->th_disp_index++; 2496 2497 // reuse shared data structures from dynamic sched loops: 2498 sh = reinterpret_cast<dispatch_shared_info_template<kmp_int32> volatile *>( 2499 &team->t.t_disp_buffer[my_buffer_index % __kmp_dispatch_num_buffers]); 2500 KD_TRACE(10, ("__kmpc_sections_init: T#%d my_buffer_index:%d\n", gtid, 2501 my_buffer_index)); 2502 2503 th->th.th_dispatch->th_deo_fcn = __kmp_dispatch_deo_error; 2504 th->th.th_dispatch->th_dxo_fcn = __kmp_dispatch_dxo_error; 2505 2506 KD_TRACE(100, ("__kmpc_sections_init: T#%d before wait: my_buffer_index:%d " 2507 "sh->buffer_index:%d\n", 2508 gtid, my_buffer_index, sh->buffer_index)); 2509 __kmp_wait<kmp_uint32>(&sh->buffer_index, my_buffer_index, 2510 __kmp_eq<kmp_uint32> USE_ITT_BUILD_ARG(NULL)); 2511 // Note: KMP_WAIT() cannot be used there: buffer index and 2512 // my_buffer_index are *always* 32-bit integers. 2513 KMP_MB(); 2514 KD_TRACE(100, ("__kmpc_sections_init: T#%d after wait: my_buffer_index:%d " 2515 "sh->buffer_index:%d\n", 2516 gtid, my_buffer_index, sh->buffer_index)); 2517 2518 th->th.th_dispatch->th_dispatch_pr_current = 2519 nullptr; // sections construct doesn't need private data 2520 th->th.th_dispatch->th_dispatch_sh_current = 2521 CCAST(dispatch_shared_info_t *, (volatile dispatch_shared_info_t *)sh); 2522 } 2523 2524#if OMPT_SUPPORT && OMPT_OPTIONAL 2525 if (ompt_enabled.ompt_callback_work) { 2526 ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL); 2527 ompt_task_info_t *task_info = __ompt_get_task_info_object(0); 2528 ompt_callbacks.ompt_callback(ompt_callback_work)( 2529 ompt_work_sections, ompt_scope_begin, &(team_info->parallel_data), 2530 &(task_info->task_data), 0, OMPT_GET_RETURN_ADDRESS(0)); 2531 } 2532#endif 2533 KMP_PUSH_PARTITIONED_TIMER(OMP_sections); 2534 2535 return active; 2536} 2537 2538/*! 2539@ingroup WORK_SHARING 2540@param loc source location information 2541@param global_tid global thread number 2542@param numberOfSections number of sections in the 'sections' construct 2543@return unsigned [from 0 to n) - number (id) of the section to execute next on 2544this thread. 
n (or any other number not in range) - nothing to execute on this 2545thread 2546*/ 2547 2548kmp_int32 __kmpc_next_section(ident_t *loc, kmp_int32 gtid, 2549 kmp_int32 numberOfSections) { 2550 2551 KMP_TIME_PARTITIONED_BLOCK(OMP_sections_overhead); 2552 2553 kmp_info_t *th = __kmp_threads[gtid]; 2554#ifdef KMP_DEBUG 2555 kmp_team_t *team = th->th.th_team; 2556#endif 2557 2558 KD_TRACE(1000, ("__kmp_dispatch_next: T#%d; number of sections:%d\n", gtid, 2559 numberOfSections)); 2560 2561 // For serialized case we should not call this function: 2562 KMP_DEBUG_ASSERT(!team->t.t_serialized); 2563 2564 dispatch_shared_info_template<kmp_int32> volatile *sh; 2565 2566 KMP_DEBUG_ASSERT(th->th.th_dispatch == 2567 &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]); 2568 2569 KMP_DEBUG_ASSERT(!(th->th.th_dispatch->th_dispatch_pr_current)); 2570 sh = reinterpret_cast<dispatch_shared_info_template<kmp_int32> volatile *>( 2571 th->th.th_dispatch->th_dispatch_sh_current); 2572 KMP_DEBUG_ASSERT(sh); 2573 2574 kmp_int32 sectionIndex = 0; 2575 bool moreSectionsToExecute = true; 2576 2577 // Find section to execute: 2578 sectionIndex = test_then_inc<kmp_int32>((kmp_int32 *)&sh->u.s.iteration); 2579 if (sectionIndex >= numberOfSections) { 2580 moreSectionsToExecute = false; 2581 } 2582 2583 // status == 0: no more sections to execute; 2584 // OMPTODO: __kmpc_end_sections could be bypassed? 2585 if (!moreSectionsToExecute) { 2586 kmp_int32 num_done; 2587 2588 num_done = test_then_inc<kmp_int32>((kmp_int32 *)(&sh->u.s.num_done)); 2589 2590 if (num_done == th->th.th_team_nproc - 1) { 2591 /* NOTE: release this buffer to be reused */ 2592 2593 KMP_MB(); /* Flush all pending memory write invalidates. */ 2594 2595 sh->u.s.num_done = 0; 2596 sh->u.s.iteration = 0; 2597 2598 KMP_MB(); /* Flush all pending memory write invalidates. */ 2599 2600 sh->buffer_index += __kmp_dispatch_num_buffers; 2601 KD_TRACE(100, ("__kmpc_next_section: T#%d change buffer_index:%d\n", gtid, 2602 sh->buffer_index)); 2603 2604 KMP_MB(); /* Flush all pending memory write invalidates. */ 2605 2606 } // if 2607 2608 th->th.th_dispatch->th_deo_fcn = NULL; 2609 th->th.th_dispatch->th_dxo_fcn = NULL; 2610 th->th.th_dispatch->th_dispatch_sh_current = NULL; 2611 th->th.th_dispatch->th_dispatch_pr_current = NULL; 2612 2613#if OMPT_SUPPORT && OMPT_OPTIONAL 2614 if (ompt_enabled.ompt_callback_dispatch) { 2615 ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL); 2616 ompt_task_info_t *task_info = __ompt_get_task_info_object(0); 2617 ompt_data_t instance = ompt_data_none; 2618 instance.ptr = OMPT_GET_RETURN_ADDRESS(0); 2619 ompt_callbacks.ompt_callback(ompt_callback_dispatch)( 2620 &(team_info->parallel_data), &(task_info->task_data), 2621 ompt_dispatch_section, instance); 2622 } 2623#endif 2624 } 2625 2626 return sectionIndex; 2627} 2628 2629/*! 2630@ingroup WORK_SHARING 2631@param loc source location information 2632@param global_tid global thread number 2633 2634End of "sections" construct. 2635Don't need to wait here: barrier is added separately when needed. 
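A sketch (added for illustration, not a normative contract) of the pattern a
compiler might emit for a three-section construct against these entry points:
@code
  if (__kmpc_sections_init(loc, gtid)) {
    kmp_int32 s;
    while ((s = __kmpc_next_section(loc, gtid, 3)) < 3) {
      // run section s (0, 1 or 2)
    }
  } else {
    // serialized: this thread runs all three sections itself
  }
  __kmpc_end_sections(loc, gtid);
@endcode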
2636*/ 2637void __kmpc_end_sections(ident_t *loc, kmp_int32 gtid) { 2638 2639 kmp_info_t *th = __kmp_threads[gtid]; 2640 int active = !th->th.th_team->t.t_serialized; 2641 2642 KD_TRACE(100, ("__kmpc_end_sections: T#%d called\n", gtid)); 2643 2644 if (!active) { 2645 // In active case call finalization is done in __kmpc_next_section 2646#if OMPT_SUPPORT && OMPT_OPTIONAL 2647 if (ompt_enabled.ompt_callback_work) { 2648 ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL); 2649 ompt_task_info_t *task_info = __ompt_get_task_info_object(0); 2650 ompt_callbacks.ompt_callback(ompt_callback_work)( 2651 ompt_work_sections, ompt_scope_end, &(team_info->parallel_data), 2652 &(task_info->task_data), 0, OMPT_GET_RETURN_ADDRESS(0)); 2653 } 2654#endif 2655 } 2656 2657 KMP_POP_PARTITIONED_TIMER(); 2658 KD_TRACE(100, ("__kmpc_end_sections: T#%d returned\n", gtid)); 2659} 2660 2661template <typename T> 2662static void __kmp_dist_get_bounds(ident_t *loc, kmp_int32 gtid, 2663 kmp_int32 *plastiter, T *plower, T *pupper, 2664 typename traits_t<T>::signed_t incr) { 2665 typedef typename traits_t<T>::unsigned_t UT; 2666 kmp_uint32 team_id; 2667 kmp_uint32 nteams; 2668 UT trip_count; 2669 kmp_team_t *team; 2670 kmp_info_t *th; 2671 2672 KMP_DEBUG_ASSERT(plastiter && plower && pupper); 2673 KE_TRACE(10, ("__kmpc_dist_get_bounds called (%d)\n", gtid)); 2674#ifdef KMP_DEBUG 2675 typedef typename traits_t<T>::signed_t ST; 2676 { 2677 char *buff; 2678 // create format specifiers before the debug output 2679 buff = __kmp_str_format("__kmpc_dist_get_bounds: T#%%d liter=%%d " 2680 "iter=(%%%s, %%%s, %%%s) signed?<%s>\n", 2681 traits_t<T>::spec, traits_t<T>::spec, 2682 traits_t<ST>::spec, traits_t<T>::spec); 2683 KD_TRACE(100, (buff, gtid, *plastiter, *plower, *pupper, incr)); 2684 __kmp_str_free(&buff); 2685 } 2686#endif 2687 2688 if (__kmp_env_consistency_check) { 2689 if (incr == 0) { 2690 __kmp_error_construct(kmp_i18n_msg_CnsLoopIncrZeroProhibited, ct_pdo, 2691 loc); 2692 } 2693 if (incr > 0 ? (*pupper < *plower) : (*plower < *pupper)) { 2694 // The loop is illegal. 2695 // Some zero-trip loops maintained by compiler, e.g.: 2696 // for(i=10;i<0;++i) // lower >= upper - run-time check 2697 // for(i=0;i>10;--i) // lower <= upper - run-time check 2698 // for(i=0;i>10;++i) // incr > 0 - compile-time check 2699 // for(i=10;i<0;--i) // incr < 0 - compile-time check 2700 // Compiler does not check the following illegal loops: 2701 // for(i=0;i<10;i+=incr) // where incr<0 2702 // for(i=10;i>0;i-=incr) // where incr<0 2703 __kmp_error_construct(kmp_i18n_msg_CnsLoopIncrIllegal, ct_pdo, loc); 2704 } 2705 } 2706 __kmp_assert_valid_gtid(gtid); 2707 th = __kmp_threads[gtid]; 2708 team = th->th.th_team; 2709 KMP_DEBUG_ASSERT(th->th.th_teams_microtask); // we are in the teams construct 2710 nteams = th->th.th_teams_size.nteams; 2711 team_id = team->t.t_master_tid; 2712 KMP_DEBUG_ASSERT(nteams == (kmp_uint32)team->t.t_parent->t.t_nproc); 2713 2714 // compute global trip count 2715 if (incr == 1) { 2716 trip_count = *pupper - *plower + 1; 2717 } else if (incr == -1) { 2718 trip_count = *plower - *pupper + 1; 2719 } else if (incr > 0) { 2720 // upper-lower can exceed the limit of signed type 2721 trip_count = (UT)(*pupper - *plower) / incr + 1; 2722 } else { 2723 trip_count = (UT)(*plower - *pupper) / (-incr) + 1; 2724 } 2725 2726 if (trip_count <= nteams) { 2727 KMP_DEBUG_ASSERT( 2728 __kmp_static == kmp_sch_static_greedy || 2729 __kmp_static == 2730 kmp_sch_static_balanced); // Unknown static scheduling type. 
2731 // only some teams get single iteration, others get nothing 2732 if (team_id < trip_count) { 2733 *pupper = *plower = *plower + team_id * incr; 2734 } else { 2735 *plower = *pupper + incr; // zero-trip loop 2736 } 2737 if (plastiter != NULL) 2738 *plastiter = (team_id == trip_count - 1); 2739 } else { 2740 if (__kmp_static == kmp_sch_static_balanced) { 2741 UT chunk = trip_count / nteams; 2742 UT extras = trip_count % nteams; 2743 *plower += 2744 incr * (team_id * chunk + (team_id < extras ? team_id : extras)); 2745 *pupper = *plower + chunk * incr - (team_id < extras ? 0 : incr); 2746 if (plastiter != NULL) 2747 *plastiter = (team_id == nteams - 1); 2748 } else { 2749 T chunk_inc_count = 2750 (trip_count / nteams + ((trip_count % nteams) ? 1 : 0)) * incr; 2751 T upper = *pupper; 2752 KMP_DEBUG_ASSERT(__kmp_static == kmp_sch_static_greedy); 2753 // Unknown static scheduling type. 2754 *plower += team_id * chunk_inc_count; 2755 *pupper = *plower + chunk_inc_count - incr; 2756 // Check/correct bounds if needed 2757 if (incr > 0) { 2758 if (*pupper < *plower) 2759 *pupper = traits_t<T>::max_value; 2760 if (plastiter != NULL) 2761 *plastiter = *plower <= upper && *pupper > upper - incr; 2762 if (*pupper > upper) 2763 *pupper = upper; // tracker C73258 2764 } else { 2765 if (*pupper > *plower) 2766 *pupper = traits_t<T>::min_value; 2767 if (plastiter != NULL) 2768 *plastiter = *plower >= upper && *pupper < upper - incr; 2769 if (*pupper < upper) 2770 *pupper = upper; // tracker C73258 2771 } 2772 } 2773 } 2774} 2775 2776//----------------------------------------------------------------------------- 2777// Dispatch routines 2778// Transfer call to template< type T > 2779// __kmp_dispatch_init( ident_t *loc, int gtid, enum sched_type schedule, 2780// T lb, T ub, ST st, ST chunk ) 2781extern "C" { 2782 2783/*! 2784@ingroup WORK_SHARING 2785@{ 2786@param loc Source location 2787@param gtid Global thread id 2788@param schedule Schedule type 2789@param lb Lower bound 2790@param ub Upper bound 2791@param st Step (or increment if you prefer) 2792@param chunk The chunk size to block with 2793 2794This function prepares the runtime to start a dynamically scheduled for loop, 2795saving the loop arguments. 2796These functions are all identical apart from the types of the arguments. 2797*/ 2798 2799void __kmpc_dispatch_init_4(ident_t *loc, kmp_int32 gtid, 2800 enum sched_type schedule, kmp_int32 lb, 2801 kmp_int32 ub, kmp_int32 st, kmp_int32 chunk) { 2802 KMP_DEBUG_ASSERT(__kmp_init_serial); 2803#if OMPT_SUPPORT && OMPT_OPTIONAL 2804 OMPT_STORE_RETURN_ADDRESS(gtid); 2805#endif 2806 __kmp_dispatch_init<kmp_int32>(loc, gtid, schedule, lb, ub, st, chunk, true); 2807} 2808/*! 2809See @ref __kmpc_dispatch_init_4 2810*/ 2811void __kmpc_dispatch_init_4u(ident_t *loc, kmp_int32 gtid, 2812 enum sched_type schedule, kmp_uint32 lb, 2813 kmp_uint32 ub, kmp_int32 st, kmp_int32 chunk) { 2814 KMP_DEBUG_ASSERT(__kmp_init_serial); 2815#if OMPT_SUPPORT && OMPT_OPTIONAL 2816 OMPT_STORE_RETURN_ADDRESS(gtid); 2817#endif 2818 __kmp_dispatch_init<kmp_uint32>(loc, gtid, schedule, lb, ub, st, chunk, true); 2819} 2820 2821/*! 
See @ref __kmpc_dispatch_init_4
2823*/
2824void __kmpc_dispatch_init_8(ident_t *loc, kmp_int32 gtid,
2825                            enum sched_type schedule, kmp_int64 lb,
2826                            kmp_int64 ub, kmp_int64 st, kmp_int64 chunk) {
2827   KMP_DEBUG_ASSERT(__kmp_init_serial);
2828#if OMPT_SUPPORT && OMPT_OPTIONAL
2829   OMPT_STORE_RETURN_ADDRESS(gtid);
2830#endif
2831   __kmp_dispatch_init<kmp_int64>(loc, gtid, schedule, lb, ub, st, chunk, true);
2832}
2833
2834/*!
2835See @ref __kmpc_dispatch_init_4
2836*/
2837void __kmpc_dispatch_init_8u(ident_t *loc, kmp_int32 gtid,
2838                             enum sched_type schedule, kmp_uint64 lb,
2839                             kmp_uint64 ub, kmp_int64 st, kmp_int64 chunk) {
2840   KMP_DEBUG_ASSERT(__kmp_init_serial);
2841#if OMPT_SUPPORT && OMPT_OPTIONAL
2842   OMPT_STORE_RETURN_ADDRESS(gtid);
2843#endif
2844   __kmp_dispatch_init<kmp_uint64>(loc, gtid, schedule, lb, ub, st, chunk, true);
2845}
2846
2847/*!
2848See @ref __kmpc_dispatch_init_4
2849
2850These functions differ from the __kmpc_dispatch_init set in that they are
2851called for the composite distribute parallel for construct, so the per-team
2852iteration space must be computed before regular iteration dispatching starts.
2853
2854These functions are all identical apart from the types of the arguments.
2855*/
2856void __kmpc_dist_dispatch_init_4(ident_t *loc, kmp_int32 gtid,
2857                                 enum sched_type schedule, kmp_int32 *p_last,
2858                                 kmp_int32 lb, kmp_int32 ub, kmp_int32 st,
2859                                 kmp_int32 chunk) {
2860   KMP_DEBUG_ASSERT(__kmp_init_serial);
2861#if OMPT_SUPPORT && OMPT_OPTIONAL
2862   OMPT_STORE_RETURN_ADDRESS(gtid);
2863#endif
2864   __kmp_dist_get_bounds<kmp_int32>(loc, gtid, p_last, &lb, &ub, st);
2865   __kmp_dispatch_init<kmp_int32>(loc, gtid, schedule, lb, ub, st, chunk, true);
2866}
2867
2868void __kmpc_dist_dispatch_init_4u(ident_t *loc, kmp_int32 gtid,
2869                                  enum sched_type schedule, kmp_int32 *p_last,
2870                                  kmp_uint32 lb, kmp_uint32 ub, kmp_int32 st,
2871                                  kmp_int32 chunk) {
2872   KMP_DEBUG_ASSERT(__kmp_init_serial);
2873#if OMPT_SUPPORT && OMPT_OPTIONAL
2874   OMPT_STORE_RETURN_ADDRESS(gtid);
2875#endif
2876   __kmp_dist_get_bounds<kmp_uint32>(loc, gtid, p_last, &lb, &ub, st);
2877   __kmp_dispatch_init<kmp_uint32>(loc, gtid, schedule, lb, ub, st, chunk, true);
2878}
2879
2880void __kmpc_dist_dispatch_init_8(ident_t *loc, kmp_int32 gtid,
2881                                 enum sched_type schedule, kmp_int32 *p_last,
2882                                 kmp_int64 lb, kmp_int64 ub, kmp_int64 st,
2883                                 kmp_int64 chunk) {
2884   KMP_DEBUG_ASSERT(__kmp_init_serial);
2885#if OMPT_SUPPORT && OMPT_OPTIONAL
2886   OMPT_STORE_RETURN_ADDRESS(gtid);
2887#endif
2888   __kmp_dist_get_bounds<kmp_int64>(loc, gtid, p_last, &lb, &ub, st);
2889   __kmp_dispatch_init<kmp_int64>(loc, gtid, schedule, lb, ub, st, chunk, true);
2890}
2891
2892void __kmpc_dist_dispatch_init_8u(ident_t *loc, kmp_int32 gtid,
2893                                  enum sched_type schedule, kmp_int32 *p_last,
2894                                  kmp_uint64 lb, kmp_uint64 ub, kmp_int64 st,
2895                                  kmp_int64 chunk) {
2896   KMP_DEBUG_ASSERT(__kmp_init_serial);
2897#if OMPT_SUPPORT && OMPT_OPTIONAL
2898   OMPT_STORE_RETURN_ADDRESS(gtid);
2899#endif
2900   __kmp_dist_get_bounds<kmp_uint64>(loc, gtid, p_last, &lb, &ub, st);
2901   __kmp_dispatch_init<kmp_uint64>(loc, gtid, schedule, lb, ub, st, chunk, true);
2902}
2903
2904/*!
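Typical use (an illustrative sketch of what a compiler emits for a
dynamically scheduled, non-ordered loop; added here for exposition, not a
normative contract):
@code
  kmp_int32 lb, ub, st, last;
  __kmpc_dispatch_init_4(loc, gtid, kmp_sch_dynamic_chunked, 0, n - 1, 1, 1);
  while (__kmpc_dispatch_next_4(loc, gtid, &last, &lb, &ub, &st)) {
    for (kmp_int32 i = lb; i <= ub; i += st)
      body(i);
  }
@endcode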
/*!
@param loc Source code location
@param gtid Global thread id
@param p_last Pointer to a flag set to one if this is the last chunk or zero
otherwise
@param p_lb Pointer to the lower bound for the next chunk of work
@param p_ub Pointer to the upper bound for the next chunk of work
@param p_st Pointer to the stride for the next chunk of work
@return one if there is work to be done, zero otherwise

Get the next dynamically allocated chunk of work for this thread.
If there is no more work, then the lb, ub and stride need not be modified.
*/
int __kmpc_dispatch_next_4(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
                           kmp_int32 *p_lb, kmp_int32 *p_ub, kmp_int32 *p_st) {
#if OMPT_SUPPORT && OMPT_OPTIONAL
  OMPT_STORE_RETURN_ADDRESS(gtid);
#endif
  return __kmp_dispatch_next<kmp_int32>(loc, gtid, p_last, p_lb, p_ub, p_st
#if OMPT_SUPPORT && OMPT_OPTIONAL
                                        ,
                                        OMPT_LOAD_RETURN_ADDRESS(gtid)
#endif
  );
}

/*!
See @ref __kmpc_dispatch_next_4
*/
int __kmpc_dispatch_next_4u(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
                            kmp_uint32 *p_lb, kmp_uint32 *p_ub,
                            kmp_int32 *p_st) {
#if OMPT_SUPPORT && OMPT_OPTIONAL
  OMPT_STORE_RETURN_ADDRESS(gtid);
#endif
  return __kmp_dispatch_next<kmp_uint32>(loc, gtid, p_last, p_lb, p_ub, p_st
#if OMPT_SUPPORT && OMPT_OPTIONAL
                                         ,
                                         OMPT_LOAD_RETURN_ADDRESS(gtid)
#endif
  );
}

/*!
See @ref __kmpc_dispatch_next_4
*/
int __kmpc_dispatch_next_8(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
                           kmp_int64 *p_lb, kmp_int64 *p_ub, kmp_int64 *p_st) {
#if OMPT_SUPPORT && OMPT_OPTIONAL
  OMPT_STORE_RETURN_ADDRESS(gtid);
#endif
  return __kmp_dispatch_next<kmp_int64>(loc, gtid, p_last, p_lb, p_ub, p_st
#if OMPT_SUPPORT && OMPT_OPTIONAL
                                        ,
                                        OMPT_LOAD_RETURN_ADDRESS(gtid)
#endif
  );
}

/*!
See @ref __kmpc_dispatch_next_4
*/
int __kmpc_dispatch_next_8u(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
                            kmp_uint64 *p_lb, kmp_uint64 *p_ub,
                            kmp_int64 *p_st) {
#if OMPT_SUPPORT && OMPT_OPTIONAL
  OMPT_STORE_RETURN_ADDRESS(gtid);
#endif
  return __kmp_dispatch_next<kmp_uint64>(loc, gtid, p_last, p_lb, p_ub, p_st
#if OMPT_SUPPORT && OMPT_OPTIONAL
                                         ,
                                         OMPT_LOAD_RETURN_ADDRESS(gtid)
#endif
  );
}

/*!
@param loc Source code location
@param gtid Global thread id

Mark the end of a dynamic loop.
*/
void __kmpc_dispatch_fini_4(ident_t *loc, kmp_int32 gtid) {
  __kmp_dispatch_finish<kmp_uint32>(gtid, loc);
}

/*!
See @ref __kmpc_dispatch_fini_4
*/
void __kmpc_dispatch_fini_8(ident_t *loc, kmp_int32 gtid) {
  __kmp_dispatch_finish<kmp_uint64>(gtid, loc);
}

/*!
See @ref __kmpc_dispatch_fini_4
*/
void __kmpc_dispatch_fini_4u(ident_t *loc, kmp_int32 gtid) {
  __kmp_dispatch_finish<kmp_uint32>(gtid, loc);
}

/*!
See @ref __kmpc_dispatch_fini_4
*/
void __kmpc_dispatch_fini_8u(ident_t *loc, kmp_int32 gtid) {
  __kmp_dispatch_finish<kmp_uint64>(gtid, loc);
}
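/* Illustrative dispatch loop (a sketch under assumed conventions, not code
   emitted by any particular compiler): a thread repeatedly asks for chunks
   until __kmpc_dispatch_next_4 returns zero. The inclusive bounds and the
   placeholder body() are assumptions for illustration.

     kmp_int32 lb, ub, st, last;
     __kmpc_dispatch_init_4(loc, gtid, kmp_sch_dynamic_chunked, 0, n - 1, 1, 4);
     while (__kmpc_dispatch_next_4(loc, gtid, &last, &lb, &ub, &st)) {
       for (kmp_int32 i = lb; i <= ub; i += st)
         body(i); // hypothetical loop body
     }
*/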
/*!
@} */

//-----------------------------------------------------------------------------
// Non-template routines from kmp_dispatch.cpp used in other sources

kmp_uint32 __kmp_eq_4(kmp_uint32 value, kmp_uint32 checker) {
  return value == checker;
}

kmp_uint32 __kmp_neq_4(kmp_uint32 value, kmp_uint32 checker) {
  return value != checker;
}

kmp_uint32 __kmp_lt_4(kmp_uint32 value, kmp_uint32 checker) {
  return value < checker;
}

kmp_uint32 __kmp_ge_4(kmp_uint32 value, kmp_uint32 checker) {
  return value >= checker;
}

kmp_uint32 __kmp_le_4(kmp_uint32 value, kmp_uint32 checker) {
  return value <= checker;
}

kmp_uint32
__kmp_wait_4(volatile kmp_uint32 *spinner, kmp_uint32 checker,
             kmp_uint32 (*pred)(kmp_uint32, kmp_uint32),
             void *obj // Higher-level synchronization object, or NULL.
) {
  // note: we may not belong to a team at this point
  volatile kmp_uint32 *spin = spinner;
  kmp_uint32 check = checker;
  kmp_uint32 spins;
  kmp_uint32 (*f)(kmp_uint32, kmp_uint32) = pred;
  kmp_uint32 r;
  kmp_uint64 time;

  KMP_FSYNC_SPIN_INIT(obj, CCAST(kmp_uint32 *, spin));
  KMP_INIT_YIELD(spins);
  KMP_INIT_BACKOFF(time);
  // main wait spin loop
  while (!f(r = TCR_4(*spin), check)) {
    KMP_FSYNC_SPIN_PREPARE(obj);
    /* GEH - remove this since it was accidentally introduced when kmp_wait was
       split. It causes problems with infinite recursion because of exit lock */
    /* if ( TCR_4(__kmp_global.g.g_done) && __kmp_global.g.g_abort)
        __kmp_abort_thread(); */
    KMP_YIELD_OVERSUB_ELSE_SPIN(spins, time);
  }
  KMP_FSYNC_SPIN_ACQUIRED(obj);
  return r;
}
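/* Minimal usage sketch (illustrative; assumes some other thread eventually
   performs the store): spin until `flag` equals 1, using one of the
   predicates defined above.

     volatile kmp_uint32 flag = 0; // set to 1 by another thread
     __kmp_wait_4(&flag, 1, __kmp_eq_4, NULL); // returns once flag == 1
*/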
3067) { 3068 // note: we may not belong to a team at this point 3069 void *spin = spinner; 3070 kmp_uint32 check = checker; 3071 kmp_uint32 spins; 3072 kmp_uint32 (*f)(void *, kmp_uint32) = pred; 3073 kmp_uint64 time; 3074 3075 KMP_FSYNC_SPIN_INIT(obj, spin); 3076 KMP_INIT_YIELD(spins); 3077 KMP_INIT_BACKOFF(time); 3078 // main wait spin loop 3079 while (!f(spin, check)) { 3080 KMP_FSYNC_SPIN_PREPARE(obj); 3081 /* if we have waited a bit, or are noversubscribed, yield */ 3082 /* pause is in the following code */ 3083 KMP_YIELD_OVERSUB_ELSE_SPIN(spins, time); 3084 } 3085 KMP_FSYNC_SPIN_ACQUIRED(obj); 3086} 3087 3088} // extern "C" 3089 3090#ifdef KMP_GOMP_COMPAT 3091 3092void __kmp_aux_dispatch_init_4(ident_t *loc, kmp_int32 gtid, 3093 enum sched_type schedule, kmp_int32 lb, 3094 kmp_int32 ub, kmp_int32 st, kmp_int32 chunk, 3095 int push_ws) { 3096 __kmp_dispatch_init<kmp_int32>(loc, gtid, schedule, lb, ub, st, chunk, 3097 push_ws); 3098} 3099 3100void __kmp_aux_dispatch_init_4u(ident_t *loc, kmp_int32 gtid, 3101 enum sched_type schedule, kmp_uint32 lb, 3102 kmp_uint32 ub, kmp_int32 st, kmp_int32 chunk, 3103 int push_ws) { 3104 __kmp_dispatch_init<kmp_uint32>(loc, gtid, schedule, lb, ub, st, chunk, 3105 push_ws); 3106} 3107 3108void __kmp_aux_dispatch_init_8(ident_t *loc, kmp_int32 gtid, 3109 enum sched_type schedule, kmp_int64 lb, 3110 kmp_int64 ub, kmp_int64 st, kmp_int64 chunk, 3111 int push_ws) { 3112 __kmp_dispatch_init<kmp_int64>(loc, gtid, schedule, lb, ub, st, chunk, 3113 push_ws); 3114} 3115 3116void __kmp_aux_dispatch_init_8u(ident_t *loc, kmp_int32 gtid, 3117 enum sched_type schedule, kmp_uint64 lb, 3118 kmp_uint64 ub, kmp_int64 st, kmp_int64 chunk, 3119 int push_ws) { 3120 __kmp_dispatch_init<kmp_uint64>(loc, gtid, schedule, lb, ub, st, chunk, 3121 push_ws); 3122} 3123 3124void __kmp_aux_dispatch_fini_chunk_4(ident_t *loc, kmp_int32 gtid) { 3125 __kmp_dispatch_finish_chunk<kmp_uint32>(gtid, loc); 3126} 3127 3128void __kmp_aux_dispatch_fini_chunk_8(ident_t *loc, kmp_int32 gtid) { 3129 __kmp_dispatch_finish_chunk<kmp_uint64>(gtid, loc); 3130} 3131 3132void __kmp_aux_dispatch_fini_chunk_4u(ident_t *loc, kmp_int32 gtid) { 3133 __kmp_dispatch_finish_chunk<kmp_uint32>(gtid, loc); 3134} 3135 3136void __kmp_aux_dispatch_fini_chunk_8u(ident_t *loc, kmp_int32 gtid) { 3137 __kmp_dispatch_finish_chunk<kmp_uint64>(gtid, loc); 3138} 3139 3140#endif /* KMP_GOMP_COMPAT */ 3141 3142/* ------------------------------------------------------------------------ */ 3143