/*
 * kmp_dispatch.h: dynamic scheduling - iteration initialization and dispatch.
 */

//===----------------------------------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#ifndef KMP_DISPATCH_H
#define KMP_DISPATCH_H

/* ------------------------------------------------------------------------ */
/* ------------------------------------------------------------------------ */

#include "kmp.h"
#include "kmp_error.h"
#include "kmp_i18n.h"
#include "kmp_itt.h"
#include "kmp_stats.h"
#include "kmp_str.h"
#if KMP_OS_WINDOWS && KMP_ARCH_X86
#include <float.h>
#endif

#if OMPT_SUPPORT
#include "ompt-internal.h"
#include "ompt-specific.h"
#endif

/* ------------------------------------------------------------------------ */
/* ------------------------------------------------------------------------ */
#if KMP_USE_HIER_SCHED
// Forward declarations of some hierarchical scheduling data structures
template <typename T> struct kmp_hier_t;
template <typename T> struct kmp_hier_top_unit_t;
#endif // KMP_USE_HIER_SCHED

template <typename T> struct dispatch_shared_info_template;
template <typename T> struct dispatch_private_info_template;

template <typename T>
extern void __kmp_dispatch_init_algorithm(ident_t *loc, int gtid,
                                          dispatch_private_info_template<T> *pr,
                                          enum sched_type schedule, T lb, T ub,
                                          typename traits_t<T>::signed_t st,
#if USE_ITT_BUILD
                                          kmp_uint64 *cur_chunk,
#endif
                                          typename traits_t<T>::signed_t chunk,
                                          T nproc, T unit_id);
template <typename T>
extern int __kmp_dispatch_next_algorithm(
    int gtid, dispatch_private_info_template<T> *pr,
    dispatch_shared_info_template<T> volatile *sh, kmp_int32 *p_last, T *p_lb,
    T *p_ub, typename traits_t<T>::signed_t *p_st, T nproc, T unit_id);

void __kmp_dispatch_dxo_error(int *gtid_ref, int *cid_ref, ident_t *loc_ref);
void __kmp_dispatch_deo_error(int *gtid_ref, int *cid_ref, ident_t *loc_ref);

#if KMP_STATIC_STEAL_ENABLED

// replaces dispatch_private_info{32,64} structures and
// dispatch_private_info{32,64}_t types
template <typename T> struct dispatch_private_infoXX_template {
  typedef typename traits_t<T>::unsigned_t UT;
  typedef typename traits_t<T>::signed_t ST;
  UT count; // unsigned
  T ub;
  /* Adding KMP_ALIGN_CACHE here doesn't help / can hurt performance */
  T lb;
  ST st; // signed
  UT tc; // unsigned
  kmp_lock_t *steal_lock; // lock used for chunk stealing

  UT ordered_lower; // unsigned
  UT ordered_upper; // unsigned

  /* parm[1-4] are used in different ways by different scheduling algorithms */

  // KMP_ALIGN(32) ensures (if the KMP_ALIGN macro is turned on) that
  //    a) parm3 is properly aligned and
  //    b) all of parm1-4 are in the same cache line.
  // Because parm1-4 are used together, performance seems to be better
  // if they are in the same cache line (not measured, though).
  struct KMP_ALIGN(32) { // compiler does not accept sizeof(T)*4
    T parm1;
    T parm2;
    T parm3;
    T parm4;
  };

#if KMP_WEIGHTED_ITERATIONS_SUPPORTED
  UT pchunks; // total number of chunks for processes with p-core
  UT num_procs_with_pcore; // number of threads with p-core
  T first_thread_with_ecore;
#endif
#if KMP_OS_WINDOWS
  T last_upper;
#endif /* KMP_OS_WINDOWS */
};

#else /* KMP_STATIC_STEAL_ENABLED */

// replaces dispatch_private_info{32,64} structures and
// dispatch_private_info{32,64}_t types
template <typename T> struct dispatch_private_infoXX_template {
  typedef typename traits_t<T>::unsigned_t UT;
  typedef typename traits_t<T>::signed_t ST;
  T lb;
  T ub;
  ST st; // signed
  UT tc; // unsigned

  T parm1;
  T parm2;
  T parm3;
  T parm4;

  UT count; // unsigned

  UT ordered_lower; // unsigned
  UT ordered_upper; // unsigned
#if KMP_OS_WINDOWS
  T last_upper;
#endif /* KMP_OS_WINDOWS */
};
#endif /* KMP_STATIC_STEAL_ENABLED */

template <typename T> struct KMP_ALIGN_CACHE dispatch_private_info_template {
  // duplicate the alignment here; otherwise the size of the structure is not
  // correct in our compiler
  union KMP_ALIGN_CACHE private_info_tmpl {
    dispatch_private_infoXX_template<T> p;
    dispatch_private_info64_t p64;
  } u;
  enum sched_type schedule; /* scheduling algorithm */
  kmp_sched_flags_t flags; /* flags (e.g., ordered, nomerge, etc.) */
  std::atomic<kmp_uint32> steal_flag; // static_steal only, state of a buffer
  kmp_uint32 ordered_bumped;
  dispatch_private_info *next; /* stack of buffers for nest of serial regions */
  kmp_uint32 type_size;
#if KMP_USE_HIER_SCHED
  kmp_int32 hier_id;
  kmp_hier_top_unit_t<T> *hier_parent;
  // member functions
  kmp_int32 get_hier_id() const { return hier_id; }
  kmp_hier_top_unit_t<T> *get_parent() { return hier_parent; }
#endif
  enum cons_type pushed_ws;
};

// replaces dispatch_shared_info{32,64} structures and
// dispatch_shared_info{32,64}_t types
template <typename T> struct dispatch_shared_infoXX_template {
  typedef typename traits_t<T>::unsigned_t UT;
  typedef typename traits_t<T>::signed_t ST;
  /* chunk index under dynamic, number of idle threads under static-steal;
     iteration index otherwise */
  volatile UT iteration;
  volatile ST num_done;
  volatile UT ordered_iteration;
  // padding to retain the structure size while keeping ordered_iteration scalar
  UT ordered_dummy[KMP_MAX_ORDERED - 3];
};

// replaces dispatch_shared_info structure and dispatch_shared_info_t type
template <typename T> struct dispatch_shared_info_template {
  typedef typename traits_t<T>::unsigned_t UT;
  // we need a union here to keep the structure size
  union shared_info_tmpl {
    dispatch_shared_infoXX_template<UT> s;
    dispatch_shared_info64_t s64;
  } u;
  volatile kmp_uint32 buffer_index;
  volatile kmp_int32 doacross_buf_idx; // teamwise index
  kmp_uint32 *doacross_flags; // array of iteration flags (0/1)
  kmp_int32 doacross_num_done; // count finished threads
#if KMP_USE_HIER_SCHED
  kmp_hier_t<T> *hier;
#endif
#if KMP_USE_HWLOC
  // When linking with libhwloc, the ORDERED EPCC test slows down on big
  // machines (> 48 cores). Performance analysis showed that a cache thrash
  // was occurring and this padding helps alleviate the problem.
  char padding[64];
#endif
};

/* ------------------------------------------------------------------------ */
/* ------------------------------------------------------------------------ */

#undef USE_TEST_LOCKS

// test_then_add template (general template should NOT be used)
template <typename T> static __forceinline T test_then_add(volatile T *p, T d);

template <>
__forceinline kmp_int32 test_then_add<kmp_int32>(volatile kmp_int32 *p,
                                                 kmp_int32 d) {
  kmp_int32 r;
  r = KMP_TEST_THEN_ADD32(p, d);
  return r;
}

template <>
__forceinline kmp_int64 test_then_add<kmp_int64>(volatile kmp_int64 *p,
                                                 kmp_int64 d) {
  kmp_int64 r;
  r = KMP_TEST_THEN_ADD64(p, d);
  return r;
}

// test_then_inc_acq template (general template should NOT be used)
template <typename T> static __forceinline T test_then_inc_acq(volatile T *p);

template <>
__forceinline kmp_int32 test_then_inc_acq<kmp_int32>(volatile kmp_int32 *p) {
  kmp_int32 r;
  r = KMP_TEST_THEN_INC_ACQ32(p);
  return r;
}

template <>
__forceinline kmp_int64 test_then_inc_acq<kmp_int64>(volatile kmp_int64 *p) {
  kmp_int64 r;
  r = KMP_TEST_THEN_INC_ACQ64(p);
  return r;
}

// test_then_inc template (general template should NOT be used)
template <typename T> static __forceinline T test_then_inc(volatile T *p);

template <>
__forceinline kmp_int32 test_then_inc<kmp_int32>(volatile kmp_int32 *p) {
  kmp_int32 r;
  r = KMP_TEST_THEN_INC32(p);
  return r;
}

template <>
__forceinline kmp_int64 test_then_inc<kmp_int64>(volatile kmp_int64 *p) {
  kmp_int64 r;
  r = KMP_TEST_THEN_INC64(p);
  return r;
}
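
// Usage sketch (illustrative only, not part of the runtime): assuming the
// usual fetch-and-op semantics of the underlying KMP_TEST_THEN_* macros,
// each helper atomically updates the location and returns the value it held
// *before* the update:
//
//   volatile kmp_int32 counter = 0;
//   kmp_int32 prev = test_then_add<kmp_int32>(&counter, 4); // prev == 0
//   prev = test_then_inc<kmp_int32>(&counter); // prev == 4, counter == 5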

// compare_and_swap template (general template should NOT be used)
template <typename T>
static __forceinline kmp_int32 compare_and_swap(volatile T *p, T c, T s);

template <>
__forceinline kmp_int32 compare_and_swap<kmp_int32>(volatile kmp_int32 *p,
                                                    kmp_int32 c, kmp_int32 s) {
  return KMP_COMPARE_AND_STORE_REL32(p, c, s);
}

template <>
__forceinline kmp_int32 compare_and_swap<kmp_int64>(volatile kmp_int64 *p,
                                                    kmp_int64 c, kmp_int64 s) {
  return KMP_COMPARE_AND_STORE_REL64(p, c, s);
}
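
// Usage sketch (illustrative only, not part of the runtime): the
// specializations return nonzero when the store happened, so the canonical
// retry loop for an atomic update looks like:
//
//   volatile kmp_int32 v = 0;
//   kmp_int32 old, desired;
//   do {
//     old = v;
//     desired = old + 1;
//   } while (!compare_and_swap<kmp_int32>(&v, old, desired));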

template <typename T> kmp_uint32 __kmp_ge(T value, T checker) {
  return value >= checker;
}
template <typename T> kmp_uint32 __kmp_eq(T value, T checker) {
  return value == checker;
}

/*
    Spin-wait loop that pauses between checks.
    Waits until pred returns non-zero when called with *spinner and checker.
    Does NOT put threads to sleep.
    Arguments:
        UT is an unsigned 4- or 8-byte type
        spinner - memory location whose value is checked
        checker - value against which *spinner is compared (>, <, ==, etc.)
        pred - predicate function that performs the binary comparison
#if USE_ITT_BUILD
        obj - higher-level synchronization object to report to ittnotify. It
        is used to report locks consistently. For example, if a lock is
        acquired immediately, its address is reported to ittnotify via
        KMP_FSYNC_ACQUIRED(). However, if the lock cannot be acquired
        immediately and the lock routine calls KMP_WAIT(), the latter should
        report the same address, not the address of the low-level spinner.
#endif // USE_ITT_BUILD
    TODO: make inline function (move to header file for icl)
*/
template <typename UT>
static UT __kmp_wait(volatile UT *spinner, UT checker,
                     kmp_uint32 (*pred)(UT, UT) USE_ITT_BUILD_ARG(void *obj)) {
  // note: we may not belong to a team at this point
  volatile UT *spin = spinner;
  UT check = checker;
  kmp_uint32 spins;
  kmp_uint32 (*f)(UT, UT) = pred;
  kmp_uint64 time;
  UT r;

  KMP_FSYNC_SPIN_INIT(obj, CCAST(UT *, spin));
  KMP_INIT_YIELD(spins);
  KMP_INIT_BACKOFF(time);
  // main wait spin loop
  while (!f(r = *spin, check)) {
    KMP_FSYNC_SPIN_PREPARE(obj);
    /* GEH - remove this since it was accidentally introduced when kmp_wait was
       split.
       It causes problems with infinite recursion because of exit lock */
    /* if ( TCR_4(__kmp_global.g.g_done) && __kmp_global.g.g_abort)
        __kmp_abort_thread(); */
    // If oversubscribed, or if we have waited a bit, then yield.
    KMP_YIELD_OVERSUB_ELSE_SPIN(spins, time);
  }
  KMP_FSYNC_SPIN_ACQUIRED(obj);
  return r;
}
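
// Usage sketch (illustrative only): spin until *flag is at least 1, using the
// __kmp_ge predicate defined above. The dispatch code below applies the same
// pattern to sh->u.s.ordered_iteration.
//
//   volatile kmp_uint32 flag = 0; // set to 1 by another thread
//   kmp_uint32 seen = __kmp_wait<kmp_uint32>(&flag, 1U, __kmp_ge<kmp_uint32>
//                                            USE_ITT_BUILD_ARG(NULL));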

/* ------------------------------------------------------------------------ */
/* ------------------------------------------------------------------------ */

template <typename UT>
void __kmp_dispatch_deo(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
  dispatch_private_info_template<UT> *pr;

  int gtid = *gtid_ref;
  //    int  cid = *cid_ref;
  kmp_info_t *th = __kmp_threads[gtid];
  KMP_DEBUG_ASSERT(th->th.th_dispatch);

  KD_TRACE(100, ("__kmp_dispatch_deo: T#%d called\n", gtid));
  if (__kmp_env_consistency_check) {
    pr = reinterpret_cast<dispatch_private_info_template<UT> *>(
        th->th.th_dispatch->th_dispatch_pr_current);
    if (pr->pushed_ws != ct_none) {
#if KMP_USE_DYNAMIC_LOCK
      __kmp_push_sync(gtid, ct_ordered_in_pdo, loc_ref, NULL, 0);
#else
      __kmp_push_sync(gtid, ct_ordered_in_pdo, loc_ref, NULL);
#endif
    }
  }

  if (!th->th.th_team->t.t_serialized) {
    dispatch_shared_info_template<UT> *sh =
        reinterpret_cast<dispatch_shared_info_template<UT> *>(
            th->th.th_dispatch->th_dispatch_sh_current);
    UT lower;

    if (!__kmp_env_consistency_check) {
      pr = reinterpret_cast<dispatch_private_info_template<UT> *>(
          th->th.th_dispatch->th_dispatch_pr_current);
    }
    lower = pr->u.p.ordered_lower;

#if !defined(KMP_GOMP_COMPAT)
    if (__kmp_env_consistency_check) {
      if (pr->ordered_bumped) {
        struct cons_header *p = __kmp_threads[gtid]->th.th_cons;
        __kmp_error_construct2(kmp_i18n_msg_CnsMultipleNesting,
                               ct_ordered_in_pdo, loc_ref,
                               &p->stack_data[p->w_top]);
      }
    }
#endif /* !defined(KMP_GOMP_COMPAT) */

    KMP_MB();
#ifdef KMP_DEBUG
    {
      char *buff;
      // create format specifiers before the debug output
      buff = __kmp_str_format("__kmp_dispatch_deo: T#%%d before wait: "
                              "ordered_iter:%%%s lower:%%%s\n",
                              traits_t<UT>::spec, traits_t<UT>::spec);
      KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower));
      __kmp_str_free(&buff);
    }
#endif
    __kmp_wait<UT>(&sh->u.s.ordered_iteration, lower,
                   __kmp_ge<UT> USE_ITT_BUILD_ARG(NULL));
    KMP_MB(); /* is this necessary? */
#ifdef KMP_DEBUG
    {
      char *buff;
      // create format specifiers before the debug output
      buff = __kmp_str_format("__kmp_dispatch_deo: T#%%d after wait: "
                              "ordered_iter:%%%s lower:%%%s\n",
                              traits_t<UT>::spec, traits_t<UT>::spec);
      KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower));
      __kmp_str_free(&buff);
    }
#endif
  }
  KD_TRACE(100, ("__kmp_dispatch_deo: T#%d returned\n", gtid));
}

template <typename UT>
void __kmp_dispatch_dxo(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
  typedef typename traits_t<UT>::signed_t ST;
  dispatch_private_info_template<UT> *pr;

  int gtid = *gtid_ref;
  //    int  cid = *cid_ref;
  kmp_info_t *th = __kmp_threads[gtid];
  KMP_DEBUG_ASSERT(th->th.th_dispatch);

  KD_TRACE(100, ("__kmp_dispatch_dxo: T#%d called\n", gtid));
  if (__kmp_env_consistency_check) {
    pr = reinterpret_cast<dispatch_private_info_template<UT> *>(
        th->th.th_dispatch->th_dispatch_pr_current);
    if (pr->pushed_ws != ct_none) {
      __kmp_pop_sync(gtid, ct_ordered_in_pdo, loc_ref);
    }
  }

  if (!th->th.th_team->t.t_serialized) {
    dispatch_shared_info_template<UT> *sh =
        reinterpret_cast<dispatch_shared_info_template<UT> *>(
            th->th.th_dispatch->th_dispatch_sh_current);

    if (!__kmp_env_consistency_check) {
      pr = reinterpret_cast<dispatch_private_info_template<UT> *>(
          th->th.th_dispatch->th_dispatch_pr_current);
    }

    KMP_FSYNC_RELEASING(CCAST(UT *, &sh->u.s.ordered_iteration));
#if !defined(KMP_GOMP_COMPAT)
    if (__kmp_env_consistency_check) {
      if (pr->ordered_bumped != 0) {
        struct cons_header *p = __kmp_threads[gtid]->th.th_cons;
        /* How to test it? - OM */
        __kmp_error_construct2(kmp_i18n_msg_CnsMultipleNesting,
                               ct_ordered_in_pdo, loc_ref,
                               &p->stack_data[p->w_top]);
      }
    }
#endif /* !defined(KMP_GOMP_COMPAT) */

    KMP_MB(); /* Flush all pending memory write invalidates.  */

    pr->ordered_bumped += 1;

    KD_TRACE(1000,
             ("__kmp_dispatch_dxo: T#%d bumping ordered ordered_bumped=%d\n",
              gtid, pr->ordered_bumped));

    KMP_MB(); /* Flush all pending memory write invalidates.  */

    /* TODO use general release procedure? */
    test_then_inc<ST>((volatile ST *)&sh->u.s.ordered_iteration);

    KMP_MB(); /* Flush all pending memory write invalidates.  */
  }
  KD_TRACE(100, ("__kmp_dispatch_dxo: T#%d returned\n", gtid));
}
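
/* Sketch of the ordered handshake implemented by the two routines above
   (illustrative pseudocode, not part of the runtime): the thread that owns
   the chunk starting at iteration `lower` blocks in __kmp_dispatch_deo until
   the shared counter catches up, runs the ordered region, and then
   __kmp_dispatch_dxo bumps the counter to release the owner of the next
   chunk:

     __kmp_wait(&sh->u.s.ordered_iteration, lower, __kmp_ge);  // deo: enter
     ... body of the ordered region ...
     test_then_inc(&sh->u.s.ordered_iteration);                // dxo: exit
*/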

/* Computes and returns x raised to the power of y, where y must be a
   non-negative integer.
 */
template <typename UT>
static __forceinline long double __kmp_pow(long double x, UT y) {
  long double s = 1.0L;

  KMP_DEBUG_ASSERT(x > 0.0 && x < 1.0);
  // KMP_DEBUG_ASSERT(y >= 0); // y is unsigned
  while (y) {
    if (y & 1)
      s *= x;
    x *= x;
    y >>= 1;
  }
  return s;
}
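
// Worked example (illustrative): __kmp_pow<kmp_uint32>(0.5L, 3) uses
// exponentiation by squaring. y = 3 (binary 11): s = 0.5, x = 0.25, y = 1;
// then s = 0.5 * 0.25 = 0.125, y = 0; the function returns 0.125 = 0.5^3.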

/* Computes and returns the number of unassigned iterations after idx chunks
   have been assigned
   (the total number of unassigned iterations in chunks with index greater than
   or equal to idx).
   __forceinline appears to be broken here: if this function is declared
   __forceinline, the behavior is wrong (one of the unit tests,
   sch_guided_analytical_basic.cpp, fails).
*/
template <typename T>
static __inline typename traits_t<T>::unsigned_t
__kmp_dispatch_guided_remaining(T tc, typename traits_t<T>::floating_t base,
                                typename traits_t<T>::unsigned_t idx) {
  /* Note: On Windows* OS on IA-32 architecture and Intel(R) 64, at
     least for ICL 8.1, long double arithmetic may not really have
     long double precision, even with /Qlong_double.  Currently, we
     work around that in the caller code by manipulating the FPCW for
     Windows* OS on IA-32 architecture.  The lack of precision is not
     expected to be a correctness issue, though.
  */
  typedef typename traits_t<T>::unsigned_t UT;

  long double x = tc * __kmp_pow<UT>(base, idx);
  UT r = (UT)x;
  if (x == r)
    return r;
  return r + 1;
}
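
// Worked example (illustrative): with tc = 100 total iterations and
// base = 0.75, after idx = 2 chunks have been handed out the estimate is
// 100 * 0.75^2 = 56.25, which is rounded up to 57 remaining iterations.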

// Parameters of the guided-iterative algorithm:
//   p2 = n * nproc * ( chunk + 1 )  // point of switching to dynamic
//   p3 = 1 / ( n * nproc )          // remaining iterations multiplier
// By default n = 2. For example, with n = 3 the chunk distribution will be
// flatter.
// With n = 1 the first chunk is the same as for a static schedule, i.e.,
// trip / nproc.
static const int guided_int_param = 2;
static const double guided_flt_param = 0.5; // = 1.0 / guided_int_param;
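
// Worked instance of the formulas above (illustrative): with the default
// n = 2, nproc = 4 threads, and chunk = 1, the algorithm switches to dynamic
// scheduling once p2 = 2 * 4 * (1 + 1) = 16 iterations remain, and each chunk
// takes p3 = 1 / (2 * 4) = 0.125 of the remaining iterations.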
#endif // KMP_DISPATCH_H