/*
 * kmp_dispatch.h: dynamic scheduling - iteration initialization and dispatch.
 */

//===----------------------------------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#ifndef KMP_DISPATCH_H
#define KMP_DISPATCH_H

/* ------------------------------------------------------------------------ */
/* ------------------------------------------------------------------------ */

#include "kmp.h"
#include "kmp_error.h"
#include "kmp_i18n.h"
#include "kmp_itt.h"
#include "kmp_stats.h"
#include "kmp_str.h"
#if KMP_OS_WINDOWS && KMP_ARCH_X86
#include <float.h>
#endif

#if OMPT_SUPPORT
#include "ompt-internal.h"
#include "ompt-specific.h"
#endif

/* ------------------------------------------------------------------------ */
/* ------------------------------------------------------------------------ */
#if KMP_USE_HIER_SCHED
// Forward declarations of some hierarchical scheduling data structures
template <typename T> struct kmp_hier_t;
template <typename T> struct kmp_hier_top_unit_t;
#endif // KMP_USE_HIER_SCHED

template <typename T> struct dispatch_shared_info_template;
template <typename T> struct dispatch_private_info_template;

template <typename T>
extern void __kmp_dispatch_init_algorithm(ident_t *loc, int gtid,
                                          dispatch_private_info_template<T> *pr,
                                          enum sched_type schedule, T lb, T ub,
                                          typename traits_t<T>::signed_t st,
#if USE_ITT_BUILD
                                          kmp_uint64 *cur_chunk,
#endif
                                          typename traits_t<T>::signed_t chunk,
                                          T nproc, T unit_id);
template <typename T>
extern int __kmp_dispatch_next_algorithm(
    int gtid, dispatch_private_info_template<T> *pr,
    dispatch_shared_info_template<T> volatile *sh, kmp_int32 *p_last, T *p_lb,
    T *p_ub, typename traits_t<T>::signed_t *p_st, T nproc, T unit_id);

void __kmp_dispatch_dxo_error(int *gtid_ref, int *cid_ref, ident_t *loc_ref);
void __kmp_dispatch_deo_error(int *gtid_ref, int *cid_ref, ident_t *loc_ref);

#if KMP_STATIC_STEAL_ENABLED

// replaces dispatch_private_info{32,64} structures and
// dispatch_private_info{32,64}_t types
template <typename T> struct dispatch_private_infoXX_template {
  typedef typename traits_t<T>::unsigned_t UT;
  typedef typename traits_t<T>::signed_t ST;
  UT count; // unsigned
  T ub;
  /* Adding KMP_ALIGN_CACHE here doesn't help / can hurt performance */
  T lb;
  ST st; // signed
  UT tc; // unsigned
  T static_steal_counter; // for static_steal only; maybe better to put after ub

  /* parm[1-4] are used in different ways by different scheduling algorithms */

  // KMP_ALIGN(32) ensures (if the KMP_ALIGN macro is turned on) that
  //    a) parm3 is properly aligned and
  //    b) all of parm1-4 are in the same cache line.
  // Because parm1-4 are used together, performance seems to be better
  // if they are in the same cache line (not measured, though).

  struct KMP_ALIGN(32) { // the compiler does not accept sizeof(T) * 4 here
    T parm1;
    T parm2;
    T parm3;
    T parm4;
  };

  UT ordered_lower; // unsigned
  UT ordered_upper; // unsigned
#if KMP_OS_WINDOWS
  T last_upper;
#endif /* KMP_OS_WINDOWS */
};

#else /* KMP_STATIC_STEAL_ENABLED */

// replaces dispatch_private_info{32,64} structures and
// dispatch_private_info{32,64}_t types
template <typename T> struct dispatch_private_infoXX_template {
  typedef typename traits_t<T>::unsigned_t UT;
  typedef typename traits_t<T>::signed_t ST;
  T lb;
  T ub;
  ST st; // signed
  UT tc; // unsigned

  T parm1;
  T parm2;
  T parm3;
  T parm4;

  UT count; // unsigned

  UT ordered_lower; // unsigned
  UT ordered_upper; // unsigned
#if KMP_OS_WINDOWS
  T last_upper;
#endif /* KMP_OS_WINDOWS */
};
#endif /* KMP_STATIC_STEAL_ENABLED */

template <typename T> struct KMP_ALIGN_CACHE dispatch_private_info_template {
  // duplicate alignment here, otherwise the size of the structure is not
  // correct in our compiler
  union KMP_ALIGN_CACHE private_info_tmpl {
    dispatch_private_infoXX_template<T> p;
    dispatch_private_info64_t p64;
  } u;
  enum sched_type schedule; /* scheduling algorithm */
  kmp_sched_flags_t flags; /* flags (e.g., ordered, nomerge, etc.) */
  kmp_uint32 ordered_bumped;
  // dummy entries to retain the structure size
  kmp_int32 ordered_dummy[KMP_MAX_ORDERED - 3];
  dispatch_private_info *next; /* stack of buffers for nest of serial regions */
  kmp_uint32 type_size;
#if KMP_USE_HIER_SCHED
  kmp_int32 hier_id;
  kmp_hier_top_unit_t<T> *hier_parent;
  // member functions
  kmp_int32 get_hier_id() const { return hier_id; }
  kmp_hier_top_unit_t<T> *get_parent() { return hier_parent; }
#endif
  enum cons_type pushed_ws;
};

// replaces dispatch_shared_info{32,64} structures and
// dispatch_shared_info{32,64}_t types
template <typename T> struct dispatch_shared_infoXX_template {
  typedef typename traits_t<T>::unsigned_t UT;
  /* chunk index under dynamic, number of idle threads under static-steal;
     iteration index otherwise */
  volatile UT iteration;
  volatile UT num_done;
  volatile UT ordered_iteration;
  // to retain the structure size after making ordered_iteration scalar
  UT ordered_dummy[KMP_MAX_ORDERED - 3];
};

// replaces dispatch_shared_info structure and dispatch_shared_info_t type
template <typename T> struct dispatch_shared_info_template {
  typedef typename traits_t<T>::unsigned_t UT;
  // we need a union here to keep the structure size
  union shared_info_tmpl {
    dispatch_shared_infoXX_template<UT> s;
    dispatch_shared_info64_t s64;
  } u;
  volatile kmp_uint32 buffer_index;
  volatile kmp_int32 doacross_buf_idx; // teamwise index
  kmp_uint32 *doacross_flags; // array of iteration flags (0/1)
  kmp_int32 doacross_num_done; // count finished threads
#if KMP_USE_HIER_SCHED
  kmp_hier_t<T> *hier;
#endif
#if KMP_USE_HWLOC
  // When linking with libhwloc, the ORDERED EPCC test slows down on big
  // machines (> 48 cores). Performance analysis showed that a cache thrash
  // was occurring and this padding helps alleviate the problem.
  char padding[64];
#endif
};

/* ------------------------------------------------------------------------ */
/* ------------------------------------------------------------------------ */

#undef USE_TEST_LOCKS

// test_then_add template (general template should NOT be used)
template <typename T> static __forceinline T test_then_add(volatile T *p, T d);

template <>
__forceinline kmp_int32 test_then_add<kmp_int32>(volatile kmp_int32 *p,
                                                 kmp_int32 d) {
  kmp_int32 r;
  r = KMP_TEST_THEN_ADD32(p, d);
  return r;
}

template <>
__forceinline kmp_int64 test_then_add<kmp_int64>(volatile kmp_int64 *p,
                                                 kmp_int64 d) {
  kmp_int64 r;
  r = KMP_TEST_THEN_ADD64(p, d);
  return r;
}

// test_then_inc_acq template (general template should NOT be used)
template <typename T> static __forceinline T test_then_inc_acq(volatile T *p);

template <>
__forceinline kmp_int32 test_then_inc_acq<kmp_int32>(volatile kmp_int32 *p) {
  kmp_int32 r;
  r = KMP_TEST_THEN_INC_ACQ32(p);
  return r;
}

template <>
__forceinline kmp_int64 test_then_inc_acq<kmp_int64>(volatile kmp_int64 *p) {
  kmp_int64 r;
  r = KMP_TEST_THEN_INC_ACQ64(p);
  return r;
}

// test_then_inc template (general template should NOT be used)
template <typename T> static __forceinline T test_then_inc(volatile T *p);

template <>
__forceinline kmp_int32 test_then_inc<kmp_int32>(volatile kmp_int32 *p) {
  kmp_int32 r;
  r = KMP_TEST_THEN_INC32(p);
  return r;
}

template <>
__forceinline kmp_int64 test_then_inc<kmp_int64>(volatile kmp_int64 *p) {
  kmp_int64 r;
  r = KMP_TEST_THEN_INC64(p);
  return r;
}

// compare_and_swap template (general template should NOT be used)
template <typename T>
static __forceinline kmp_int32 compare_and_swap(volatile T *p, T c, T s);

template <>
__forceinline kmp_int32 compare_and_swap<kmp_int32>(volatile kmp_int32 *p,
                                                    kmp_int32 c, kmp_int32 s) {
  return KMP_COMPARE_AND_STORE_REL32(p, c, s);
}

template <>
__forceinline kmp_int32 compare_and_swap<kmp_int64>(volatile kmp_int64 *p,
                                                    kmp_int64 c, kmp_int64 s) {
  return KMP_COMPARE_AND_STORE_REL64(p, c, s);
}

template <typename T> kmp_uint32 __kmp_ge(T value, T checker) {
  return value >= checker;
}
template <typename T> kmp_uint32 __kmp_eq(T value, T checker) {
  return value == checker;
}

/*
    Spin wait loop that pauses between checks.
    Waits until the predicate returns non-zero when called with *spinner and
    check. Does NOT put threads to sleep.
    Arguments:
        UT is an unsigned 4- or 8-byte type
        spinner - memory location whose value is checked
        checker - value against which *spinner is compared (>, <, ==, etc.)
        pred - predicate function that performs the binary comparison
#if USE_ITT_BUILD
        obj -- higher-level synchronization object to report to ittnotify. It
        is used to report locks consistently. For example, if a lock is
        acquired immediately, its address is reported to ittnotify via
        KMP_FSYNC_ACQUIRED(). However, if the lock cannot be acquired
        immediately and the lock routine calls KMP_WAIT(), the latter should
        report the same address, not the address of the low-level spinner.
#endif // USE_ITT_BUILD
    TODO: make inline function (move to header file for icl)
*/
template <typename UT>
static UT __kmp_wait(volatile UT *spinner, UT checker,
                     kmp_uint32 (*pred)(UT, UT) USE_ITT_BUILD_ARG(void *obj)) {
  // note: we may not belong to a team at this point
  volatile UT *spin = spinner;
  UT check = checker;
  kmp_uint32 spins;
  kmp_uint32 (*f)(UT, UT) = pred;
  UT r;

  KMP_FSYNC_SPIN_INIT(obj, CCAST(UT *, spin));
  KMP_INIT_YIELD(spins);
  // main wait spin loop
  while (!f(r = *spin, check)) {
    KMP_FSYNC_SPIN_PREPARE(obj);
    /* GEH - remove this since it was accidentally introduced when kmp_wait was
       split.
       It causes problems with infinite recursion because of exit lock */
    /* if ( TCR_4(__kmp_global.g.g_done) && __kmp_global.g.g_abort)
        __kmp_abort_thread(); */
    // If oversubscribed, or have waited a bit then yield.
    KMP_YIELD_OVERSUB_ELSE_SPIN(spins);
  }
  KMP_FSYNC_SPIN_ACQUIRED(obj);
  return r;
}
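
// Illustrative usage of __kmp_wait (a hypothetical sketch; 'flag' and the
// target value 5 are made up for this example, not taken from the runtime):
//
//   volatile kmp_uint32 flag = 0; // updated by another thread
//   kmp_uint32 seen =
//       __kmp_wait<kmp_uint32>(&flag, 5,
//                              __kmp_ge<kmp_uint32> USE_ITT_BUILD_ARG(NULL));
//
// The call spins until __kmp_ge(*(&flag), 5) becomes non-zero, i.e. until the
// flag reaches at least 5, and returns the value observed. The ordered-loop
// code in __kmp_dispatch_deo() below waits on sh->u.s.ordered_iteration in
// exactly this way.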

/* ------------------------------------------------------------------------ */
/* ------------------------------------------------------------------------ */

template <typename UT>
void __kmp_dispatch_deo(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
  dispatch_private_info_template<UT> *pr;

  int gtid = *gtid_ref;
  //    int  cid = *cid_ref;
  kmp_info_t *th = __kmp_threads[gtid];
  KMP_DEBUG_ASSERT(th->th.th_dispatch);

  KD_TRACE(100, ("__kmp_dispatch_deo: T#%d called\n", gtid));
  if (__kmp_env_consistency_check) {
    pr = reinterpret_cast<dispatch_private_info_template<UT> *>(
        th->th.th_dispatch->th_dispatch_pr_current);
    if (pr->pushed_ws != ct_none) {
#if KMP_USE_DYNAMIC_LOCK
      __kmp_push_sync(gtid, ct_ordered_in_pdo, loc_ref, NULL, 0);
#else
      __kmp_push_sync(gtid, ct_ordered_in_pdo, loc_ref, NULL);
#endif
    }
  }

  if (!th->th.th_team->t.t_serialized) {
    dispatch_shared_info_template<UT> *sh =
        reinterpret_cast<dispatch_shared_info_template<UT> *>(
            th->th.th_dispatch->th_dispatch_sh_current);
    UT lower;

    if (!__kmp_env_consistency_check) {
      pr = reinterpret_cast<dispatch_private_info_template<UT> *>(
          th->th.th_dispatch->th_dispatch_pr_current);
    }
    lower = pr->u.p.ordered_lower;

#if !defined(KMP_GOMP_COMPAT)
    if (__kmp_env_consistency_check) {
      if (pr->ordered_bumped) {
        struct cons_header *p = __kmp_threads[gtid]->th.th_cons;
        __kmp_error_construct2(kmp_i18n_msg_CnsMultipleNesting,
                               ct_ordered_in_pdo, loc_ref,
                               &p->stack_data[p->w_top]);
      }
    }
#endif /* !defined(KMP_GOMP_COMPAT) */

    KMP_MB();
#ifdef KMP_DEBUG
    {
      char *buff;
      // create format specifiers before the debug output
      buff = __kmp_str_format("__kmp_dispatch_deo: T#%%d before wait: "
                              "ordered_iter:%%%s lower:%%%s\n",
                              traits_t<UT>::spec, traits_t<UT>::spec);
      KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower));
      __kmp_str_free(&buff);
    }
#endif
    __kmp_wait<UT>(&sh->u.s.ordered_iteration, lower,
                   __kmp_ge<UT> USE_ITT_BUILD_ARG(NULL));
    KMP_MB(); /* is this necessary? */
#ifdef KMP_DEBUG
    {
      char *buff;
      // create format specifiers before the debug output
      buff = __kmp_str_format("__kmp_dispatch_deo: T#%%d after wait: "
                              "ordered_iter:%%%s lower:%%%s\n",
                              traits_t<UT>::spec, traits_t<UT>::spec);
      KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower));
      __kmp_str_free(&buff);
    }
#endif
  }
  KD_TRACE(100, ("__kmp_dispatch_deo: T#%d returned\n", gtid));
}

template <typename UT>
void __kmp_dispatch_dxo(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
  typedef typename traits_t<UT>::signed_t ST;
  dispatch_private_info_template<UT> *pr;

  int gtid = *gtid_ref;
  //    int  cid = *cid_ref;
  kmp_info_t *th = __kmp_threads[gtid];
  KMP_DEBUG_ASSERT(th->th.th_dispatch);

  KD_TRACE(100, ("__kmp_dispatch_dxo: T#%d called\n", gtid));
  if (__kmp_env_consistency_check) {
    pr = reinterpret_cast<dispatch_private_info_template<UT> *>(
        th->th.th_dispatch->th_dispatch_pr_current);
    if (pr->pushed_ws != ct_none) {
      __kmp_pop_sync(gtid, ct_ordered_in_pdo, loc_ref);
    }
  }

  if (!th->th.th_team->t.t_serialized) {
    dispatch_shared_info_template<UT> *sh =
        reinterpret_cast<dispatch_shared_info_template<UT> *>(
            th->th.th_dispatch->th_dispatch_sh_current);

    if (!__kmp_env_consistency_check) {
      pr = reinterpret_cast<dispatch_private_info_template<UT> *>(
          th->th.th_dispatch->th_dispatch_pr_current);
    }

    KMP_FSYNC_RELEASING(CCAST(UT *, &sh->u.s.ordered_iteration));
#if !defined(KMP_GOMP_COMPAT)
    if (__kmp_env_consistency_check) {
      if (pr->ordered_bumped != 0) {
        struct cons_header *p = __kmp_threads[gtid]->th.th_cons;
        /* How to test it? - OM */
        __kmp_error_construct2(kmp_i18n_msg_CnsMultipleNesting,
                               ct_ordered_in_pdo, loc_ref,
                               &p->stack_data[p->w_top]);
      }
    }
#endif /* !defined(KMP_GOMP_COMPAT) */

    KMP_MB(); /* Flush all pending memory write invalidates.  */

    pr->ordered_bumped += 1;

    KD_TRACE(1000,
             ("__kmp_dispatch_dxo: T#%d bumping ordered ordered_bumped=%d\n",
              gtid, pr->ordered_bumped));

    KMP_MB(); /* Flush all pending memory write invalidates.  */

    /* TODO use general release procedure? */
    test_then_inc<ST>((volatile ST *)&sh->u.s.ordered_iteration);

    KMP_MB(); /* Flush all pending memory write invalidates.  */
  }
  KD_TRACE(100, ("__kmp_dispatch_dxo: T#%d returned\n", gtid));
}

/* Computes and returns x to the power of y, where y must be a non-negative
   integer */
template <typename UT>
static __forceinline long double __kmp_pow(long double x, UT y) {
  long double s = 1.0L;

  KMP_DEBUG_ASSERT(x > 0.0 && x < 1.0);
  // KMP_DEBUG_ASSERT(y >= 0); // y is unsigned
  while (y) {
    if (y & 1)
      s *= x;
    x *= x;
    y >>= 1;
  }
  return s;
}

/* Computes and returns the number of unassigned iterations after idx chunks
   have been assigned (the total number of unassigned iterations in chunks with
   index greater than or equal to idx).
   __forceinline appears to be broken here: if this function is marked
   __forceinline, the behavior is wrong (one of the unit tests,
   sch_guided_analytical_basic.cpp, fails).
*/
template <typename T>
static __inline typename traits_t<T>::unsigned_t
__kmp_dispatch_guided_remaining(T tc, typename traits_t<T>::floating_t base,
                                typename traits_t<T>::unsigned_t idx) {
  /* Note: On Windows* OS on IA-32 architecture and Intel(R) 64, at
     least for ICL 8.1, long double arithmetic may not really have
     long double precision, even with /Qlong_double.  Currently, we
     workaround that in the caller code, by manipulating the FPCW for
     Windows* OS on IA-32 architecture.  The lack of precision is not
     expected to be a correctness issue, though.
  */
  typedef typename traits_t<T>::unsigned_t UT;

  long double x = tc * __kmp_pow<UT>(base, idx);
  UT r = (UT)x;
  if (x == r)
    return r;
  return r + 1;
}
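
// Worked example (hypothetical numbers, for illustration only): with
// tc = 1000 total iterations, base = 0.75 and idx = 4,
//   ceil(1000 * 0.75^4) = ceil(316.40625) = 317
// iterations remain unassigned. In general the function above returns
// ceil(tc * base^idx), rounding up unless the product is exactly integral.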

// Parameters of the guided-iterative algorithm:
//   p2 = n * nproc * ( chunk + 1 )  // point of switching to dynamic
//   p3 = 1 / ( n * nproc )          // remaining iterations multiplier
// By default n = 2. For example, with n = 3 the chunk distribution is flatter.
// With n = 1 the first chunk is the same as for the static schedule,
// i.e. trip / nproc.
static const int guided_int_param = 2;
static const double guided_flt_param = 0.5; // = 1.0 / guided_int_param;
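
// Worked example (hypothetical values, for illustration only): with nproc = 8,
// chunk = 1 and the default n = guided_int_param = 2, the formulas above give
//   p2 = 2 * 8 * (1 + 1) = 32      // point of switching to dynamic
//   p3 = 1.0 / (2 * 8)   = 0.0625  // remaining iterations multiplier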
#endif // KMP_DISPATCH_H
507