/*
 * kmp_affinity.h -- header for affinity management
 */

//===----------------------------------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#ifndef KMP_AFFINITY_H
#define KMP_AFFINITY_H

#include "kmp.h"
#include "kmp_os.h"
#include <limits>

#if KMP_AFFINITY_SUPPORTED
#if KMP_USE_HWLOC
class KMPHwlocAffinity : public KMPAffinity {
public:
  class Mask : public KMPAffinity::Mask {
    hwloc_cpuset_t mask;

  public:
    Mask() {
      mask = hwloc_bitmap_alloc();
      this->zero();
    }
    ~Mask() { hwloc_bitmap_free(mask); }
    void set(int i) override { hwloc_bitmap_set(mask, i); }
    bool is_set(int i) const override { return hwloc_bitmap_isset(mask, i); }
    void clear(int i) override { hwloc_bitmap_clr(mask, i); }
    void zero() override { hwloc_bitmap_zero(mask); }
    bool empty() const override { return hwloc_bitmap_iszero(mask); }
    void copy(const KMPAffinity::Mask *src) override {
      const Mask *convert = static_cast<const Mask *>(src);
      hwloc_bitmap_copy(mask, convert->mask);
    }
    void bitwise_and(const KMPAffinity::Mask *rhs) override {
      const Mask *convert = static_cast<const Mask *>(rhs);
      hwloc_bitmap_and(mask, mask, convert->mask);
    }
    void bitwise_or(const KMPAffinity::Mask *rhs) override {
      const Mask *convert = static_cast<const Mask *>(rhs);
      hwloc_bitmap_or(mask, mask, convert->mask);
    }
    void bitwise_not() override { hwloc_bitmap_not(mask, mask); }
    bool is_equal(const KMPAffinity::Mask *rhs) const override {
      const Mask *convert = static_cast<const Mask *>(rhs);
      return hwloc_bitmap_isequal(mask, convert->mask);
    }
    int begin() const override { return hwloc_bitmap_first(mask); }
    int end() const override { return -1; }
    int next(int previous) const override {
      return hwloc_bitmap_next(mask, previous);
    }
    int get_system_affinity(bool abort_on_error) override {
      KMP_ASSERT2(KMP_AFFINITY_CAPABLE(),
                  "Illegal get affinity operation when not capable");
      long retval =
          hwloc_get_cpubind(__kmp_hwloc_topology, mask, HWLOC_CPUBIND_THREAD);
      if (retval >= 0) {
        return 0;
      }
      int error = errno;
      if (abort_on_error) {
        __kmp_fatal(KMP_MSG(FunctionError, "hwloc_get_cpubind()"),
                    KMP_ERR(error), __kmp_msg_null);
      }
      return error;
    }
    int set_system_affinity(bool abort_on_error) const override {
      KMP_ASSERT2(KMP_AFFINITY_CAPABLE(),
                  "Illegal set affinity operation when not capable");
      long retval =
          hwloc_set_cpubind(__kmp_hwloc_topology, mask, HWLOC_CPUBIND_THREAD);
      if (retval >= 0) {
        return 0;
      }
      int error = errno;
      if (abort_on_error) {
        __kmp_fatal(KMP_MSG(FunctionError, "hwloc_set_cpubind()"),
                    KMP_ERR(error), __kmp_msg_null);
      }
      return error;
    }
#if KMP_OS_WINDOWS
    int set_process_affinity(bool abort_on_error) const override {
      KMP_ASSERT2(KMP_AFFINITY_CAPABLE(),
                  "Illegal set process affinity operation when not capable");
      int error = 0;
      const hwloc_topology_support *support =
          hwloc_topology_get_support(__kmp_hwloc_topology);
      if (support->cpubind->set_proc_cpubind) {
        int retval;
        retval = hwloc_set_cpubind(__kmp_hwloc_topology, mask,
                                   HWLOC_CPUBIND_PROCESS);
        if (retval >= 0)
          return 0;
        error = errno;
        if (abort_on_error)
          __kmp_fatal(KMP_MSG(FunctionError, "hwloc_set_cpubind()"),
                      KMP_ERR(error), __kmp_msg_null);
      }
      return error;
    }
#endif
    int get_proc_group() const override {
      int group = -1;
#if KMP_OS_WINDOWS
      if (__kmp_num_proc_groups == 1) {
        return 1;
      }
      for (int i = 0; i < __kmp_num_proc_groups; i++) {
        // On Windows, the long type is always 32 bits
        unsigned long first_32_bits = hwloc_bitmap_to_ith_ulong(mask, i * 2);
        unsigned long second_32_bits =
            hwloc_bitmap_to_ith_ulong(mask, i * 2 + 1);
        if (first_32_bits == 0 && second_32_bits == 0) {
          continue;
        }
        if (group >= 0) {
          return -1;
        }
        group = i;
      }
#endif /* KMP_OS_WINDOWS */
      return group;
    }
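    // Worked example (assuming two 64-CPU processor groups): each group
    // spans two of the 32-bit ulongs inspected above. A mask touching only
    // CPUs 64-127 yields group 1; a mask with bits in both groups yields -1,
    // since a Windows thread affinity mask cannot cross group boundaries.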
  };
  void determine_capable(const char *var) override {
    const hwloc_topology_support *topology_support;
    if (__kmp_hwloc_topology == NULL) {
      if (hwloc_topology_init(&__kmp_hwloc_topology) < 0) {
        __kmp_hwloc_error = TRUE;
        if (__kmp_affinity.flags.verbose) {
          KMP_WARNING(AffHwlocErrorOccurred, var, "hwloc_topology_init()");
        }
      }
      if (hwloc_topology_load(__kmp_hwloc_topology) < 0) {
        __kmp_hwloc_error = TRUE;
        if (__kmp_affinity.flags.verbose) {
          KMP_WARNING(AffHwlocErrorOccurred, var, "hwloc_topology_load()");
        }
      }
    }
    topology_support = hwloc_topology_get_support(__kmp_hwloc_topology);
    // Is the system capable of setting/getting this thread's affinity?
    // Also, is topology discovery possible? (pu indicates ability to discover
    // processing units). And finally, were there no errors when calling any
    // hwloc_* API functions?
    if (topology_support && topology_support->cpubind->set_thisthread_cpubind &&
        topology_support->cpubind->get_thisthread_cpubind &&
        topology_support->discovery->pu && !__kmp_hwloc_error) {
      // enables affinity according to KMP_AFFINITY_CAPABLE() macro
      KMP_AFFINITY_ENABLE(TRUE);
    } else {
      // indicate that hwloc didn't work and disable affinity
      __kmp_hwloc_error = TRUE;
      KMP_AFFINITY_DISABLE();
    }
  }
  void bind_thread(int which) override {
    KMP_ASSERT2(KMP_AFFINITY_CAPABLE(),
                "Illegal set affinity operation when not capable");
    KMPAffinity::Mask *mask;
    KMP_CPU_ALLOC_ON_STACK(mask);
    KMP_CPU_ZERO(mask);
    KMP_CPU_SET(which, mask);
    __kmp_set_system_affinity(mask, TRUE);
    KMP_CPU_FREE_FROM_STACK(mask);
  }
  KMPAffinity::Mask *allocate_mask() override { return new Mask(); }
  void deallocate_mask(KMPAffinity::Mask *m) override { delete m; }
  KMPAffinity::Mask *allocate_mask_array(int num) override {
    return new Mask[num];
  }
  void deallocate_mask_array(KMPAffinity::Mask *array) override {
    Mask *hwloc_array = static_cast<Mask *>(array);
    delete[] hwloc_array;
  }
  KMPAffinity::Mask *index_mask_array(KMPAffinity::Mask *array,
                                      int index) override {
    Mask *hwloc_array = static_cast<Mask *>(array);
    return &(hwloc_array[index]);
  }
  api_type get_api_type() const override { return HWLOC; }
};
#endif /* KMP_USE_HWLOC */

#if KMP_OS_LINUX || KMP_OS_FREEBSD || KMP_OS_AIX
#if KMP_OS_LINUX
/* On some of the older OS's that we build on, these constants aren't present
   in <asm/unistd.h> #included from <sys/syscall.h>. They must be the same on
   all systems of the same arch where they are defined, and they cannot
   change; they are set in stone forever. */
#include <sys/syscall.h>
#if KMP_ARCH_X86 || KMP_ARCH_ARM
#ifndef __NR_sched_setaffinity
#define __NR_sched_setaffinity 241
#elif __NR_sched_setaffinity != 241
#error Wrong code for setaffinity system call.
#endif /* __NR_sched_setaffinity */
#ifndef __NR_sched_getaffinity
#define __NR_sched_getaffinity 242
#elif __NR_sched_getaffinity != 242
#error Wrong code for getaffinity system call.
#endif /* __NR_sched_getaffinity */
#elif KMP_ARCH_AARCH64
#ifndef __NR_sched_setaffinity
#define __NR_sched_setaffinity 122
#elif __NR_sched_setaffinity != 122
#error Wrong code for setaffinity system call.
#endif /* __NR_sched_setaffinity */
#ifndef __NR_sched_getaffinity
#define __NR_sched_getaffinity 123
#elif __NR_sched_getaffinity != 123
#error Wrong code for getaffinity system call.
#endif /* __NR_sched_getaffinity */
#elif KMP_ARCH_X86_64
#ifndef __NR_sched_setaffinity
#define __NR_sched_setaffinity 203
#elif __NR_sched_setaffinity != 203
#error Wrong code for setaffinity system call.
#endif /* __NR_sched_setaffinity */
#ifndef __NR_sched_getaffinity
#define __NR_sched_getaffinity 204
#elif __NR_sched_getaffinity != 204
#error Wrong code for getaffinity system call.
#endif /* __NR_sched_getaffinity */
#elif KMP_ARCH_PPC64
#ifndef __NR_sched_setaffinity
#define __NR_sched_setaffinity 222
#elif __NR_sched_setaffinity != 222
#error Wrong code for setaffinity system call.
#endif /* __NR_sched_setaffinity */
#ifndef __NR_sched_getaffinity
#define __NR_sched_getaffinity 223
#elif __NR_sched_getaffinity != 223
#error Wrong code for getaffinity system call.
#endif /* __NR_sched_getaffinity */
#elif KMP_ARCH_MIPS
#ifndef __NR_sched_setaffinity
#define __NR_sched_setaffinity 4239
#elif __NR_sched_setaffinity != 4239
#error Wrong code for setaffinity system call.
#endif /* __NR_sched_setaffinity */
#ifndef __NR_sched_getaffinity
#define __NR_sched_getaffinity 4240
#elif __NR_sched_getaffinity != 4240
#error Wrong code for getaffinity system call.
#endif /* __NR_sched_getaffinity */
#elif KMP_ARCH_MIPS64
#ifndef __NR_sched_setaffinity
#define __NR_sched_setaffinity 5195
#elif __NR_sched_setaffinity != 5195
#error Wrong code for setaffinity system call.
#endif /* __NR_sched_setaffinity */
#ifndef __NR_sched_getaffinity
#define __NR_sched_getaffinity 5196
#elif __NR_sched_getaffinity != 5196
#error Wrong code for getaffinity system call.
#endif /* __NR_sched_getaffinity */
#elif KMP_ARCH_LOONGARCH64
#ifndef __NR_sched_setaffinity
#define __NR_sched_setaffinity 122
#elif __NR_sched_setaffinity != 122
#error Wrong code for setaffinity system call.
#endif /* __NR_sched_setaffinity */
#ifndef __NR_sched_getaffinity
#define __NR_sched_getaffinity 123
#elif __NR_sched_getaffinity != 123
#error Wrong code for getaffinity system call.
#endif /* __NR_sched_getaffinity */
#elif KMP_ARCH_RISCV64
#ifndef __NR_sched_setaffinity
#define __NR_sched_setaffinity 122
#elif __NR_sched_setaffinity != 122
#error Wrong code for setaffinity system call.
#endif /* __NR_sched_setaffinity */
#ifndef __NR_sched_getaffinity
#define __NR_sched_getaffinity 123
#elif __NR_sched_getaffinity != 123
#error Wrong code for getaffinity system call.
#endif /* __NR_sched_getaffinity */
#elif KMP_ARCH_VE
#ifndef __NR_sched_setaffinity
#define __NR_sched_setaffinity 203
#elif __NR_sched_setaffinity != 203
#error Wrong code for setaffinity system call.
#endif /* __NR_sched_setaffinity */
#ifndef __NR_sched_getaffinity
#define __NR_sched_getaffinity 204
#elif __NR_sched_getaffinity != 204
#error Wrong code for getaffinity system call.
#endif /* __NR_sched_getaffinity */
#elif KMP_ARCH_S390X
#ifndef __NR_sched_setaffinity
#define __NR_sched_setaffinity 239
#elif __NR_sched_setaffinity != 239
#error Wrong code for setaffinity system call.
#endif /* __NR_sched_setaffinity */
#ifndef __NR_sched_getaffinity
#define __NR_sched_getaffinity 240
#elif __NR_sched_getaffinity != 240
#error Wrong code for getaffinity system call.
#endif /* __NR_sched_getaffinity */
#else
#error Unknown or unsupported architecture
#endif /* KMP_ARCH_* */
#elif KMP_OS_FREEBSD
#include <pthread.h>
#include <pthread_np.h>
#elif KMP_OS_AIX
#include <sys/dr.h>
#include <sys/rset.h>
#define VMI_MAXRADS 64 // Maximum number of RADs allowed by AIX.
#endif
class KMPNativeAffinity : public KMPAffinity {
  class Mask : public KMPAffinity::Mask {
    typedef unsigned long mask_t;
    typedef decltype(__kmp_affin_mask_size) mask_size_type;
    static const unsigned int BITS_PER_MASK_T = sizeof(mask_t) * CHAR_BIT;
    static const mask_t ONE = 1;
    mask_size_type get_num_mask_types() const {
      return __kmp_affin_mask_size / sizeof(mask_t);
    }

  public:
    mask_t *mask;
    Mask() { mask = (mask_t *)__kmp_allocate(__kmp_affin_mask_size); }
    ~Mask() {
      if (mask)
        __kmp_free(mask);
    }
    void set(int i) override {
      mask[i / BITS_PER_MASK_T] |= (ONE << (i % BITS_PER_MASK_T));
    }
    bool is_set(int i) const override {
      return (mask[i / BITS_PER_MASK_T] & (ONE << (i % BITS_PER_MASK_T)));
    }
    void clear(int i) override {
      mask[i / BITS_PER_MASK_T] &= ~(ONE << (i % BITS_PER_MASK_T));
    }
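    // Worked example of the word/bit arithmetic above (assuming 64-bit
    // mask_t): CPU 70 lives in mask[70 / 64] = mask[1] at bit 70 % 64 = 6,
    // so set(70) performs mask[1] |= (ONE << 6).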
    void zero() override {
      mask_size_type e = get_num_mask_types();
      for (mask_size_type i = 0; i < e; ++i)
        mask[i] = (mask_t)0;
    }
    bool empty() const override {
      mask_size_type e = get_num_mask_types();
      for (mask_size_type i = 0; i < e; ++i)
        if (mask[i] != (mask_t)0)
          return false;
      return true;
    }
    void copy(const KMPAffinity::Mask *src) override {
      const Mask *convert = static_cast<const Mask *>(src);
      mask_size_type e = get_num_mask_types();
      for (mask_size_type i = 0; i < e; ++i)
        mask[i] = convert->mask[i];
    }
    void bitwise_and(const KMPAffinity::Mask *rhs) override {
      const Mask *convert = static_cast<const Mask *>(rhs);
      mask_size_type e = get_num_mask_types();
      for (mask_size_type i = 0; i < e; ++i)
        mask[i] &= convert->mask[i];
    }
    void bitwise_or(const KMPAffinity::Mask *rhs) override {
      const Mask *convert = static_cast<const Mask *>(rhs);
      mask_size_type e = get_num_mask_types();
      for (mask_size_type i = 0; i < e; ++i)
        mask[i] |= convert->mask[i];
    }
    void bitwise_not() override {
      mask_size_type e = get_num_mask_types();
      for (mask_size_type i = 0; i < e; ++i)
        mask[i] = ~(mask[i]);
    }
    bool is_equal(const KMPAffinity::Mask *rhs) const override {
      const Mask *convert = static_cast<const Mask *>(rhs);
      mask_size_type e = get_num_mask_types();
      for (mask_size_type i = 0; i < e; ++i)
        if (mask[i] != convert->mask[i])
          return false;
      return true;
    }
    int begin() const override {
      int retval = 0;
      while (retval < end() && !is_set(retval))
        ++retval;
      return retval;
    }
    int end() const override {
      int e;
      __kmp_type_convert(get_num_mask_types() * BITS_PER_MASK_T, &e);
      return e;
    }
    int next(int previous) const override {
      int retval = previous + 1;
      while (retval < end() && !is_set(retval))
        ++retval;
      return retval;
    }
#if KMP_OS_AIX
    // On AIX, we don't have a way to get the CPU(s) a thread is bound to.
    // This routine is only used to get the full mask.
    int get_system_affinity(bool abort_on_error) override {
      KMP_ASSERT2(KMP_AFFINITY_CAPABLE(),
                  "Illegal get affinity operation when not capable");

      (void)abort_on_error;

      // Set the mask with all CPUs that are available.
      for (int i = 0; i < __kmp_xproc; ++i)
        KMP_CPU_SET(i, this);
      return 0;
    }
    int set_system_affinity(bool abort_on_error) const override {
      KMP_ASSERT2(KMP_AFFINITY_CAPABLE(),
                  "Illegal set affinity operation when not capable");

      int location;
      int gtid = __kmp_entry_gtid();
      int tid = thread_self();

      // Unbind the thread if it was bound to any processors before so that
      // we can bind the thread to the CPUs specified by the mask, not others.
      int retval = bindprocessor(BINDTHREAD, tid, PROCESSOR_CLASS_ANY);

      // On AIX, we can only bind to one CPU instead of a set of CPUs with the
      // bindprocessor() system call.
      KMP_CPU_SET_ITERATE(location, this) {
        if (KMP_CPU_ISSET(location, this)) {
          retval = bindprocessor(BINDTHREAD, tid, location);
          if (retval == -1 && errno == 1) {
            rsid_t rsid;
            rsethandle_t rsh;
            // Put something in rsh to prevent compiler warning
            // about uninitialized use
            rsh = rs_alloc(RS_EMPTY);
            rsid.at_pid = getpid();
            if (RS_DEFAULT_RSET != ra_getrset(R_PROCESS, rsid, 0, rsh)) {
              retval = ra_detachrset(R_PROCESS, rsid, 0);
              retval = bindprocessor(BINDTHREAD, tid, location);
            }
          }
          if (retval == 0) {
            KA_TRACE(10, ("__kmp_set_system_affinity:  Done binding "
                          "T#%d to cpu=%d.\n",
                          gtid, location));
            continue;
          }
          int error = errno;
          if (abort_on_error) {
            __kmp_fatal(KMP_MSG(FunctionError, "bindprocessor()"),
                        KMP_ERR(error), __kmp_msg_null);
          }
          KA_TRACE(10, ("__kmp_set_system_affinity:  Error binding "
                        "T#%d to cpu=%d, errno=%d.\n",
                        gtid, location, error));
          return error;
        }
      }
      return 0;
    }
#else // !KMP_OS_AIX
    int get_system_affinity(bool abort_on_error) override {
      KMP_ASSERT2(KMP_AFFINITY_CAPABLE(),
                  "Illegal get affinity operation when not capable");
#if KMP_OS_LINUX
      long retval =
          syscall(__NR_sched_getaffinity, 0, __kmp_affin_mask_size, mask);
#elif KMP_OS_FREEBSD
      int r = pthread_getaffinity_np(pthread_self(), __kmp_affin_mask_size,
                                     reinterpret_cast<cpuset_t *>(mask));
      int retval = (r == 0 ? 0 : -1);
#endif
      if (retval >= 0) {
        return 0;
      }
      int error = errno;
      if (abort_on_error) {
        __kmp_fatal(KMP_MSG(FunctionError, "pthread_getaffinity_np()"),
                    KMP_ERR(error), __kmp_msg_null);
      }
      return error;
    }
    int set_system_affinity(bool abort_on_error) const override {
      KMP_ASSERT2(KMP_AFFINITY_CAPABLE(),
                  "Illegal set affinity operation when not capable");
#if KMP_OS_LINUX
      long retval =
          syscall(__NR_sched_setaffinity, 0, __kmp_affin_mask_size, mask);
#elif KMP_OS_FREEBSD
      int r = pthread_setaffinity_np(pthread_self(), __kmp_affin_mask_size,
                                     reinterpret_cast<cpuset_t *>(mask));
      int retval = (r == 0 ? 0 : -1);
#endif
      if (retval >= 0) {
        return 0;
      }
      int error = errno;
      if (abort_on_error) {
        __kmp_fatal(KMP_MSG(FunctionError, "pthread_setaffinity_np()"),
                    KMP_ERR(error), __kmp_msg_null);
      }
      return error;
    }
#endif // KMP_OS_AIX
  };
  void determine_capable(const char *env_var) override {
    __kmp_affinity_determine_capable(env_var);
  }
  void bind_thread(int which) override { __kmp_affinity_bind_thread(which); }
  KMPAffinity::Mask *allocate_mask() override {
    KMPNativeAffinity::Mask *retval = new Mask();
    return retval;
  }
  void deallocate_mask(KMPAffinity::Mask *m) override {
    KMPNativeAffinity::Mask *native_mask =
        static_cast<KMPNativeAffinity::Mask *>(m);
    delete native_mask;
  }
  KMPAffinity::Mask *allocate_mask_array(int num) override {
    return new Mask[num];
  }
  void deallocate_mask_array(KMPAffinity::Mask *array) override {
    Mask *linux_array = static_cast<Mask *>(array);
    delete[] linux_array;
  }
  KMPAffinity::Mask *index_mask_array(KMPAffinity::Mask *array,
                                      int index) override {
    Mask *linux_array = static_cast<Mask *>(array);
    return &(linux_array[index]);
  }
  api_type get_api_type() const override { return NATIVE_OS; }
};
#endif /* KMP_OS_LINUX || KMP_OS_FREEBSD || KMP_OS_AIX */

#if KMP_OS_WINDOWS
class KMPNativeAffinity : public KMPAffinity {
  class Mask : public KMPAffinity::Mask {
    typedef ULONG_PTR mask_t;
    static const int BITS_PER_MASK_T = sizeof(mask_t) * CHAR_BIT;
    mask_t *mask;

  public:
    Mask() {
      mask = (mask_t *)__kmp_allocate(sizeof(mask_t) * __kmp_num_proc_groups);
    }
    ~Mask() {
      if (mask)
        __kmp_free(mask);
    }
    void set(int i) override {
      mask[i / BITS_PER_MASK_T] |= ((mask_t)1 << (i % BITS_PER_MASK_T));
    }
    bool is_set(int i) const override {
      return (mask[i / BITS_PER_MASK_T] & ((mask_t)1 << (i % BITS_PER_MASK_T)));
    }
    void clear(int i) override {
      mask[i / BITS_PER_MASK_T] &= ~((mask_t)1 << (i % BITS_PER_MASK_T));
    }
    void zero() override {
      for (int i = 0; i < __kmp_num_proc_groups; ++i)
        mask[i] = 0;
    }
    bool empty() const override {
      for (size_t i = 0; i < __kmp_num_proc_groups; ++i)
        if (mask[i])
          return false;
      return true;
    }
    void copy(const KMPAffinity::Mask *src) override {
      const Mask *convert = static_cast<const Mask *>(src);
      for (int i = 0; i < __kmp_num_proc_groups; ++i)
        mask[i] = convert->mask[i];
    }
    void bitwise_and(const KMPAffinity::Mask *rhs) override {
      const Mask *convert = static_cast<const Mask *>(rhs);
      for (int i = 0; i < __kmp_num_proc_groups; ++i)
        mask[i] &= convert->mask[i];
    }
    void bitwise_or(const KMPAffinity::Mask *rhs) override {
      const Mask *convert = static_cast<const Mask *>(rhs);
      for (int i = 0; i < __kmp_num_proc_groups; ++i)
        mask[i] |= convert->mask[i];
    }
    void bitwise_not() override {
      for (int i = 0; i < __kmp_num_proc_groups; ++i)
        mask[i] = ~(mask[i]);
    }
    bool is_equal(const KMPAffinity::Mask *rhs) const override {
      const Mask *convert = static_cast<const Mask *>(rhs);
      for (size_t i = 0; i < __kmp_num_proc_groups; ++i)
        if (mask[i] != convert->mask[i])
          return false;
      return true;
    }
    int begin() const override {
      int retval = 0;
      while (retval < end() && !is_set(retval))
        ++retval;
      return retval;
    }
    int end() const override { return __kmp_num_proc_groups * BITS_PER_MASK_T; }
    int next(int previous) const override {
      int retval = previous + 1;
      while (retval < end() && !is_set(retval))
        ++retval;
      return retval;
    }
    int set_process_affinity(bool abort_on_error) const override {
      if (__kmp_num_proc_groups <= 1) {
        if (!SetProcessAffinityMask(GetCurrentProcess(), *mask)) {
          DWORD error = GetLastError();
          if (abort_on_error) {
            __kmp_fatal(KMP_MSG(CantSetThreadAffMask), KMP_ERR(error),
                        __kmp_msg_null);
          }
          return error;
        }
      }
      return 0;
    }
    int set_system_affinity(bool abort_on_error) const override {
      if (__kmp_num_proc_groups > 1) {
        // Check for a valid mask.
        GROUP_AFFINITY ga;
        int group = get_proc_group();
        if (group < 0) {
          if (abort_on_error) {
            KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity");
          }
          return -1;
        }
        // Transform the bit vector into a GROUP_AFFINITY struct
        // and make the system call to set affinity.
        ga.Group = group;
        ga.Mask = mask[group];
        ga.Reserved[0] = ga.Reserved[1] = ga.Reserved[2] = 0;

        KMP_DEBUG_ASSERT(__kmp_SetThreadGroupAffinity != NULL);
        if (__kmp_SetThreadGroupAffinity(GetCurrentThread(), &ga, NULL) == 0) {
          DWORD error = GetLastError();
          if (abort_on_error) {
            __kmp_fatal(KMP_MSG(CantSetThreadAffMask), KMP_ERR(error),
                        __kmp_msg_null);
          }
          return error;
        }
      } else {
        if (!SetThreadAffinityMask(GetCurrentThread(), *mask)) {
          DWORD error = GetLastError();
          if (abort_on_error) {
            __kmp_fatal(KMP_MSG(CantSetThreadAffMask), KMP_ERR(error),
                        __kmp_msg_null);
          }
          return error;
        }
      }
      return 0;
    }
    int get_system_affinity(bool abort_on_error) override {
      if (__kmp_num_proc_groups > 1) {
        this->zero();
        GROUP_AFFINITY ga;
        KMP_DEBUG_ASSERT(__kmp_GetThreadGroupAffinity != NULL);
        if (__kmp_GetThreadGroupAffinity(GetCurrentThread(), &ga) == 0) {
          DWORD error = GetLastError();
          if (abort_on_error) {
            __kmp_fatal(KMP_MSG(FunctionError, "GetThreadGroupAffinity()"),
                        KMP_ERR(error), __kmp_msg_null);
          }
          return error;
        }
        if ((ga.Group < 0) || (ga.Group >= __kmp_num_proc_groups) ||
            (ga.Mask == 0)) {
          return -1;
        }
        mask[ga.Group] = ga.Mask;
      } else {
        mask_t newMask, sysMask, retval;
        if (!GetProcessAffinityMask(GetCurrentProcess(), &newMask, &sysMask)) {
          DWORD error = GetLastError();
          if (abort_on_error) {
            __kmp_fatal(KMP_MSG(FunctionError, "GetProcessAffinityMask()"),
                        KMP_ERR(error), __kmp_msg_null);
          }
          return error;
        }
        retval = SetThreadAffinityMask(GetCurrentThread(), newMask);
        if (!retval) {
          DWORD error = GetLastError();
          if (abort_on_error) {
            __kmp_fatal(KMP_MSG(FunctionError, "SetThreadAffinityMask()"),
                        KMP_ERR(error), __kmp_msg_null);
          }
          return error;
        }
        newMask = SetThreadAffinityMask(GetCurrentThread(), retval);
        if (!newMask) {
          DWORD error = GetLastError();
          if (abort_on_error) {
            __kmp_fatal(KMP_MSG(FunctionError, "SetThreadAffinityMask()"),
                        KMP_ERR(error), __kmp_msg_null);
          }
        }
        *mask = retval;
      }
      return 0;
    }
    int get_proc_group() const override {
      int group = -1;
      if (__kmp_num_proc_groups == 1) {
        return 1;
      }
      for (int i = 0; i < __kmp_num_proc_groups; i++) {
        if (mask[i] == 0)
          continue;
        if (group >= 0)
          return -1;
        group = i;
      }
      return group;
    }
  };
  void determine_capable(const char *env_var) override {
    __kmp_affinity_determine_capable(env_var);
  }
  void bind_thread(int which) override { __kmp_affinity_bind_thread(which); }
  KMPAffinity::Mask *allocate_mask() override { return new Mask(); }
  void deallocate_mask(KMPAffinity::Mask *m) override { delete m; }
  KMPAffinity::Mask *allocate_mask_array(int num) override {
    return new Mask[num];
  }
  void deallocate_mask_array(KMPAffinity::Mask *array) override {
    Mask *windows_array = static_cast<Mask *>(array);
    delete[] windows_array;
  }
  KMPAffinity::Mask *index_mask_array(KMPAffinity::Mask *array,
                                      int index) override {
    Mask *windows_array = static_cast<Mask *>(array);
    return &(windows_array[index]);
  }
  api_type get_api_type() const override { return NATIVE_OS; }
};
#endif /* KMP_OS_WINDOWS */
#endif /* KMP_AFFINITY_SUPPORTED */

// Describe an attribute for a level in the machine topology
struct kmp_hw_attr_t {
  int core_type : 8;
  int core_eff : 8;
  unsigned valid : 1;
  unsigned reserved : 15;

  static const int UNKNOWN_CORE_EFF = -1;

  kmp_hw_attr_t()
      : core_type(KMP_HW_CORE_TYPE_UNKNOWN), core_eff(UNKNOWN_CORE_EFF),
        valid(0), reserved(0) {}
  void set_core_type(kmp_hw_core_type_t type) {
    valid = 1;
    core_type = type;
  }
  void set_core_eff(int eff) {
    valid = 1;
    core_eff = eff;
  }
  kmp_hw_core_type_t get_core_type() const {
    return (kmp_hw_core_type_t)core_type;
  }
  int get_core_eff() const { return core_eff; }
  bool is_core_type_valid() const {
    return core_type != KMP_HW_CORE_TYPE_UNKNOWN;
  }
  bool is_core_eff_valid() const { return core_eff != UNKNOWN_CORE_EFF; }
  operator bool() const { return valid; }
  void clear() {
    core_type = KMP_HW_CORE_TYPE_UNKNOWN;
    core_eff = UNKNOWN_CORE_EFF;
    valid = 0;
  }
  bool contains(const kmp_hw_attr_t &other) const {
    if (!valid && !other.valid)
      return true;
    if (valid && other.valid) {
      if (other.is_core_type_valid()) {
        if (!is_core_type_valid() || (get_core_type() != other.get_core_type()))
          return false;
      }
      if (other.is_core_eff_valid()) {
        if (!is_core_eff_valid() || (get_core_eff() != other.get_core_eff()))
          return false;
      }
      return true;
    }
    return false;
  }
#if KMP_AFFINITY_SUPPORTED
  bool contains(const kmp_affinity_attrs_t &attr) const {
    if (!valid && !attr.valid)
      return true;
    if (valid && attr.valid) {
      if (attr.core_type != KMP_HW_CORE_TYPE_UNKNOWN)
        return (is_core_type_valid() &&
                (get_core_type() == (kmp_hw_core_type_t)attr.core_type));
      if (attr.core_eff != UNKNOWN_CORE_EFF)
        return (is_core_eff_valid() && (get_core_eff() == attr.core_eff));
      return true;
    }
    return false;
  }
#endif // KMP_AFFINITY_SUPPORTED
  bool operator==(const kmp_hw_attr_t &rhs) const {
    return (rhs.valid == valid && rhs.core_eff == core_eff &&
            rhs.core_type == core_type);
  }
  bool operator!=(const kmp_hw_attr_t &rhs) const { return !operator==(rhs); }
};
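// Usage sketch (illustrative; KMP_HW_CORE_TYPE_CORE is assumed to be one of
// the kmp_hw_core_type_t enumerators for the target):
//
//   kmp_hw_attr_t wanted;
//   wanted.set_core_type(KMP_HW_CORE_TYPE_CORE); // marks 'wanted' valid
//
//   kmp_hw_attr_t specific = wanted;
//   specific.set_core_eff(1);
//
//   // contains() treats an unset field as a wildcard, so the more specific
//   // attribute contains the less specific one, but not vice versa:
//   //   specific.contains(wanted) == true
//   //   wanted.contains(specific) == false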

#if KMP_AFFINITY_SUPPORTED
KMP_BUILD_ASSERT(sizeof(kmp_hw_attr_t) == sizeof(kmp_affinity_attrs_t));
#endif

class kmp_hw_thread_t {
public:
  static const int UNKNOWN_ID = -1;
  static const int MULTIPLE_ID = -2;
  static int compare_ids(const void *a, const void *b);
  static int compare_compact(const void *a, const void *b);
  int ids[KMP_HW_LAST];
  int sub_ids[KMP_HW_LAST];
  bool leader;
  int os_id;
  kmp_hw_attr_t attrs;

  void print() const;
  void clear() {
    for (int i = 0; i < (int)KMP_HW_LAST; ++i)
      ids[i] = UNKNOWN_ID;
    leader = false;
    attrs.clear();
  }
};

class kmp_topology_t {

  struct flags_t {
    int uniform : 1;
    int reserved : 31;
  };

  int depth;

  // The following arrays are all 'depth' long but are allocated to hold up
  // to KMP_HW_LAST objects if needed, so layers can be added without
  // reallocating any array

  // Ordered array of the types in the topology
  kmp_hw_t *types;

  // Quick topology ratios; for non-uniform topologies, each entry holds the
  // max number of itemAs per itemB,
  // e.g., [ 4 packages | 6 cores / package | 2 threads / core ]
  int *ratio;

  // Storage containing the absolute number of each topology layer
  int *count;

  // The number of core efficiencies. This is only useful for hybrid
  // topologies. Core efficiencies will range from 0 to num efficiencies - 1
  int num_core_efficiencies;
  int num_core_types;
  kmp_hw_core_type_t core_types[KMP_HW_MAX_NUM_CORE_TYPES];

  // The hardware threads array
  // hw_threads is num_hw_threads long
  // Each hw_thread's ids and sub_ids are depth deep
  int num_hw_threads;
  kmp_hw_thread_t *hw_threads;

  // Equivalence map where the key is the hardware topology item and the
  // value is the equivalent hardware topology type in the types[] array.
  // If the value is KMP_HW_UNKNOWN, then there is no known equivalence for
  // the topology type.
  kmp_hw_t equivalent[KMP_HW_LAST];

  // Flags describing the topology
  flags_t flags;

  // Compact value used during sort_compact()
  int compact;

  // Insert a new topology layer after allocation
  void _insert_layer(kmp_hw_t type, const int *ids);

#if KMP_GROUP_AFFINITY
  // Insert topology information about Windows Processor groups
  void _insert_windows_proc_groups();
#endif

  // Count each item & get the num x's per y
  // e.g., get the number of cores and the number of threads per core
  // for each (x, y) in (KMP_HW_* , KMP_HW_*)
  void _gather_enumeration_information();

  // Remove layers that don't add information to the topology.
  // This is done by having the layer take on the id = UNKNOWN_ID (-1)
  void _remove_radix1_layers();

  // Find out if the topology is uniform
  void _discover_uniformity();

  // Set all the sub_ids for each hardware thread
  void _set_sub_ids();

  // Set global affinity variables describing the number of threads per
  // core, the number of packages, the number of cores per package, and
  // the number of cores.
  void _set_globals();

  // Set the last level cache equivalent type
  void _set_last_level_cache();

  // Return the number of cores with a particular attribute, 'attr'.
  // If 'find_all' is true, then find all cores on the machine, otherwise find
  // all cores per the layer 'above'
  int _get_ncores_with_attr(const kmp_hw_attr_t &attr, int above,
                            bool find_all = false) const;

public:
  // Force use of allocate()/deallocate()
  kmp_topology_t() = delete;
  kmp_topology_t(const kmp_topology_t &t) = delete;
  kmp_topology_t(kmp_topology_t &&t) = delete;
  kmp_topology_t &operator=(const kmp_topology_t &t) = delete;
  kmp_topology_t &operator=(kmp_topology_t &&t) = delete;

  static kmp_topology_t *allocate(int nproc, int ndepth, const kmp_hw_t *types);
  static void deallocate(kmp_topology_t *);

  // Functions used in create_map() routines
  kmp_hw_thread_t &at(int index) {
    KMP_DEBUG_ASSERT(index >= 0 && index < num_hw_threads);
    return hw_threads[index];
  }
  const kmp_hw_thread_t &at(int index) const {
    KMP_DEBUG_ASSERT(index >= 0 && index < num_hw_threads);
    return hw_threads[index];
  }
  int get_num_hw_threads() const { return num_hw_threads; }
  void sort_ids() {
    qsort(hw_threads, num_hw_threads, sizeof(kmp_hw_thread_t),
          kmp_hw_thread_t::compare_ids);
  }
  // Check if the hardware ids are unique. Returns true if they are,
  // false otherwise.
  bool check_ids() const;

  // Function to call after the create_map() routine
  void canonicalize();
  void canonicalize(int pkgs, int cores_per_pkg, int thr_per_core, int cores);

// Functions used after canonicalize() is called

#if KMP_AFFINITY_SUPPORTED
  // Set the granularity for affinity settings
  void set_granularity(kmp_affinity_t &stgs) const;
  bool is_close(int hwt1, int hwt2, const kmp_affinity_t &stgs) const;
  bool restrict_to_mask(const kmp_affin_mask_t *mask);
  bool filter_hw_subset();
#endif
  bool is_uniform() const { return flags.uniform; }
  // Tell whether a type is a valid type in the topology;
  // returns KMP_HW_UNKNOWN when there is no equivalent type
  kmp_hw_t get_equivalent_type(kmp_hw_t type) const {
    if (type == KMP_HW_UNKNOWN)
      return KMP_HW_UNKNOWN;
    return equivalent[type];
  }
  // Set type1 = type2
  void set_equivalent_type(kmp_hw_t type1, kmp_hw_t type2) {
    KMP_DEBUG_ASSERT_VALID_HW_TYPE(type1);
    KMP_DEBUG_ASSERT_VALID_HW_TYPE(type2);
    kmp_hw_t real_type2 = equivalent[type2];
    if (real_type2 == KMP_HW_UNKNOWN)
      real_type2 = type2;
    equivalent[type1] = real_type2;
    // This loop is required since any of the types may have been set to
    // be equivalent to type1. They all must be checked and reset to type2.
    KMP_FOREACH_HW_TYPE(type) {
      if (equivalent[type] == type1) {
        equivalent[type] = real_type2;
      }
    }
  }
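  // Equivalence sketch (illustrative): after set_equivalent_type(KMP_HW_TILE,
  // KMP_HW_CORE), get_equivalent_type(KMP_HW_TILE) returns KMP_HW_CORE, and
  // any type previously equivalent to KMP_HW_TILE is re-pointed at
  // KMP_HW_CORE too, so the mapping stays flat (one hop at most).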
  // Calculate number of types corresponding to level1
  // per types corresponding to level2 (e.g., number of threads per core)
  int calculate_ratio(int level1, int level2) const {
    KMP_DEBUG_ASSERT(level1 >= 0 && level1 < depth);
    KMP_DEBUG_ASSERT(level2 >= 0 && level2 < depth);
    int r = 1;
    for (int level = level1; level > level2; --level)
      r *= ratio[level];
    return r;
  }
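  // Worked example (using the ratio array comment above): with types
  // [package, core, thread] and ratio = [4, 6, 2], the number of threads per
  // package is calculate_ratio(2, 0) = ratio[2] * ratio[1] = 2 * 6 = 12.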
  int get_ratio(int level) const {
    KMP_DEBUG_ASSERT(level >= 0 && level < depth);
    return ratio[level];
  }
  int get_depth() const { return depth; }
  kmp_hw_t get_type(int level) const {
    KMP_DEBUG_ASSERT(level >= 0 && level < depth);
    return types[level];
  }
  int get_level(kmp_hw_t type) const {
    KMP_DEBUG_ASSERT_VALID_HW_TYPE(type);
    int eq_type = equivalent[type];
    if (eq_type == KMP_HW_UNKNOWN)
      return -1;
    for (int i = 0; i < depth; ++i)
      if (types[i] == eq_type)
        return i;
    return -1;
  }
  int get_count(int level) const {
    KMP_DEBUG_ASSERT(level >= 0 && level < depth);
    return count[level];
  }
  // Return the total number of cores with attribute 'attr'
  int get_ncores_with_attr(const kmp_hw_attr_t &attr) const {
    return _get_ncores_with_attr(attr, -1, true);
  }
  // Return the number of cores with attribute
  // 'attr' per topology level 'above'
  int get_ncores_with_attr_per(const kmp_hw_attr_t &attr, int above) const {
    return _get_ncores_with_attr(attr, above, false);
  }

#if KMP_AFFINITY_SUPPORTED
  friend int kmp_hw_thread_t::compare_compact(const void *a, const void *b);
  void sort_compact(kmp_affinity_t &affinity) {
    compact = affinity.compact;
    qsort(hw_threads, num_hw_threads, sizeof(kmp_hw_thread_t),
          kmp_hw_thread_t::compare_compact);
  }
#endif
  void print(const char *env_var = "KMP_AFFINITY") const;
  void dump() const;
};
extern kmp_topology_t *__kmp_topology;

class kmp_hw_subset_t {
  const static size_t MAX_ATTRS = KMP_HW_MAX_NUM_CORE_EFFS;

public:
  // Describe a machine topology item in KMP_HW_SUBSET
  struct item_t {
    kmp_hw_t type;
    int num_attrs;
    int num[MAX_ATTRS];
    int offset[MAX_ATTRS];
    kmp_hw_attr_t attr[MAX_ATTRS];
  };
  // Put parentheses around max to avoid accidental use of the Windows max
  // macro.
  const static int USE_ALL = (std::numeric_limits<int>::max)();

private:
  int depth;
  int capacity;
  item_t *items;
  kmp_uint64 set;
  bool absolute;
  // The set must be able to handle up to KMP_HW_LAST number of layers
  KMP_BUILD_ASSERT(sizeof(set) * 8 >= KMP_HW_LAST);
  // Comparator to sort KMP_HW_SUBSET items into topology order.
  // All unknown topology types end up at the beginning of the subset.
  static int hw_subset_compare(const void *i1, const void *i2) {
    kmp_hw_t type1 = ((const item_t *)i1)->type;
    kmp_hw_t type2 = ((const item_t *)i2)->type;
    int level1 = __kmp_topology->get_level(type1);
    int level2 = __kmp_topology->get_level(type2);
    return level1 - level2;
  }

public:
  // Force use of allocate()/deallocate()
  kmp_hw_subset_t() = delete;
  kmp_hw_subset_t(const kmp_hw_subset_t &t) = delete;
  kmp_hw_subset_t(kmp_hw_subset_t &&t) = delete;
  kmp_hw_subset_t &operator=(const kmp_hw_subset_t &t) = delete;
  kmp_hw_subset_t &operator=(kmp_hw_subset_t &&t) = delete;

  static kmp_hw_subset_t *allocate() {
    int initial_capacity = 5;
    kmp_hw_subset_t *retval =
        (kmp_hw_subset_t *)__kmp_allocate(sizeof(kmp_hw_subset_t));
    retval->depth = 0;
    retval->capacity = initial_capacity;
    retval->set = 0ull;
    retval->absolute = false;
    retval->items = (item_t *)__kmp_allocate(sizeof(item_t) * initial_capacity);
    return retval;
  }
  static void deallocate(kmp_hw_subset_t *subset) {
    __kmp_free(subset->items);
    __kmp_free(subset);
  }
  void set_absolute() { absolute = true; }
  bool is_absolute() const { return absolute; }
  void push_back(int num, kmp_hw_t type, int offset, kmp_hw_attr_t attr) {
    for (int i = 0; i < depth; ++i) {
      // Found an existing item for this layer type
      // Add the num, offset, and attr to this item
      if (items[i].type == type) {
        int idx = items[i].num_attrs++;
        if ((size_t)idx >= MAX_ATTRS)
          return;
        items[i].num[idx] = num;
        items[i].offset[idx] = offset;
        items[i].attr[idx] = attr;
        return;
      }
    }
    if (depth == capacity - 1) {
      capacity *= 2;
      item_t *new_items = (item_t *)__kmp_allocate(sizeof(item_t) * capacity);
      for (int i = 0; i < depth; ++i)
        new_items[i] = items[i];
      __kmp_free(items);
      items = new_items;
    }
    items[depth].num_attrs = 1;
    items[depth].type = type;
    items[depth].num[0] = num;
    items[depth].offset[0] = offset;
    items[depth].attr[0] = attr;
    depth++;
    set |= (1ull << type);
  }
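  // Usage sketch (illustrative): parsing KMP_HW_SUBSET=2s,4c,2t would issue
  // one push_back() per layer, roughly:
  //
  //   subset->push_back(2, KMP_HW_SOCKET, 0, kmp_hw_attr_t());
  //   subset->push_back(4, KMP_HW_CORE, 0, kmp_hw_attr_t());
  //   subset->push_back(2, KMP_HW_THREAD, 0, kmp_hw_attr_t());
  //
  // Pushing an already-present layer type (e.g., cores split by attribute)
  // appends to that item's num/offset/attr arrays instead of adding a layer.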
  int get_depth() const { return depth; }
  const item_t &at(int index) const {
    KMP_DEBUG_ASSERT(index >= 0 && index < depth);
    return items[index];
  }
  item_t &at(int index) {
    KMP_DEBUG_ASSERT(index >= 0 && index < depth);
    return items[index];
  }
  void remove(int index) {
    KMP_DEBUG_ASSERT(index >= 0 && index < depth);
    set &= ~(1ull << items[index].type);
    for (int j = index + 1; j < depth; ++j) {
      items[j - 1] = items[j];
    }
    depth--;
  }
  void sort() {
    KMP_DEBUG_ASSERT(__kmp_topology);
    qsort(items, depth, sizeof(item_t), hw_subset_compare);
  }
  bool specified(kmp_hw_t type) const { return ((set & (1ull << type)) > 0); }
  void dump() const {
    printf("**********************\n");
    printf("*** kmp_hw_subset: ***\n");
    printf("* depth: %d\n", depth);
    printf("* items:\n");
    for (int i = 0; i < depth; ++i) {
      printf(" type: %s\n", __kmp_hw_get_keyword(items[i].type));
      for (int j = 0; j < items[i].num_attrs; ++j) {
        printf("  num: %d, offset: %d, attr: ", items[i].num[j],
               items[i].offset[j]);
        if (!items[i].attr[j]) {
          printf(" (none)\n");
        } else {
          printf(
              " core_type = %s, core_eff = %d\n",
              __kmp_hw_get_core_type_string(items[i].attr[j].get_core_type()),
              items[i].attr[j].get_core_eff());
        }
      }
    }
    printf("* set: 0x%llx\n", set);
    printf("* absolute: %d\n", absolute);
    printf("**********************\n");
  }
};
extern kmp_hw_subset_t *__kmp_hw_subset;

/* A structure for holding machine-specific hierarchy info to be computed once
   at init. This structure represents a mapping of threads to the actual machine
   hierarchy, or to our best guess at what the hierarchy might be, for the
   purpose of performing an efficient barrier. In the worst case, when there is
   no machine hierarchy information, it produces a tree suitable for a barrier,
   similar to the tree used in the hyper barrier. */
class hierarchy_info {
public:
  /* Good default values for number of leaves and branching factor, given no
     affinity information. Behaves a bit like hyper barrier. */
  static const kmp_uint32 maxLeaves = 4;
  static const kmp_uint32 minBranch = 4;
  /** Number of levels in the hierarchy. Typical levels are threads/core,
      cores/package or socket, packages/node, nodes/machine, etc. We don't want
      to get specific with nomenclature. When the machine is oversubscribed we
      add levels to duplicate the hierarchy, doubling the thread capacity of the
      hierarchy each time we add a level. */
  kmp_uint32 maxLevels;

  /** This is specifically the depth of the machine configuration hierarchy, in
      terms of the number of levels along the longest path from root to any
      leaf. It corresponds to the number of entries in numPerLevel if we exclude
      all but one trailing 1. */
  kmp_uint32 depth;
  kmp_uint32 base_num_threads;
  enum init_status { initialized = 0, not_initialized = 1, initializing = 2 };
  volatile kmp_int8 uninitialized; // 0=initialized, 1=not initialized,
  // 2=initialization in progress
  volatile kmp_int8 resizing; // 0=not resizing, 1=resizing

  /** Level 0 corresponds to leaves. numPerLevel[i] is the number of children
      the parent of a node at level i has. For example, if we have a machine
      with 4 packages, 4 cores/package and 2 HT per core, then numPerLevel =
      {2, 4, 4, 1, 1}. All empty levels are set to 1. */
  kmp_uint32 *numPerLevel;
  kmp_uint32 *skipPerLevel;
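
  // skipPerLevel[i] is the stride, in leaves, between adjacent subtrees
  // rooted at level i. init() builds it as
  //   skipPerLevel[i] = numPerLevel[i - 1] * skipPerLevel[i - 1],
  // so for the numPerLevel example above the first entries are {1, 2, 8, 32};
  // entries beyond the machine depth are filled by doubling to absorb
  // oversubscription.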

  void deriveLevels() {
    int hier_depth = __kmp_topology->get_depth();
    for (int i = hier_depth - 1, level = 0; i >= 0; --i, ++level) {
      numPerLevel[level] = __kmp_topology->get_ratio(i);
    }
  }
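  // Example (illustrative): for a topology of [4 packages | 6 cores/package
  // | 2 threads/core], deriveLevels() walks get_ratio() from the innermost
  // level outward, so numPerLevel becomes {2, 6, 4} for levels 0..2
  // (leaves first).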

  hierarchy_info()
      : maxLevels(7), depth(1), uninitialized(not_initialized), resizing(0) {}

  void fini() {
    if (!uninitialized && numPerLevel) {
      __kmp_free(numPerLevel);
      numPerLevel = NULL;
      uninitialized = not_initialized;
    }
  }

  void init(int num_addrs) {
    kmp_int8 bool_result = KMP_COMPARE_AND_STORE_ACQ8(
        &uninitialized, not_initialized, initializing);
    if (bool_result == 0) { // Wait for initialization
      while (TCR_1(uninitialized) != initialized)
        KMP_CPU_PAUSE();
      return;
    }
    KMP_DEBUG_ASSERT(bool_result == 1);

    /* Explicitly initialize the data fields here to prevent use of dirty
       values observed when a static library is re-initialized multiple times
       (e.g., when a non-OpenMP thread repeatedly launches/joins a thread that
       uses OpenMP). */
    depth = 1;
    resizing = 0;
    maxLevels = 7;
    numPerLevel =
        (kmp_uint32 *)__kmp_allocate(maxLevels * 2 * sizeof(kmp_uint32));
    skipPerLevel = &(numPerLevel[maxLevels]);
    for (kmp_uint32 i = 0; i < maxLevels;
         ++i) { // init numPerLevel[*] to 1 item per level
      numPerLevel[i] = 1;
      skipPerLevel[i] = 1;
    }

    // Use the machine topology to derive levels, if one was discovered
    if (__kmp_topology && __kmp_topology->get_depth() > 0) {
      deriveLevels();
    } else {
      numPerLevel[0] = maxLeaves;
      numPerLevel[1] = num_addrs / maxLeaves;
      if (num_addrs % maxLeaves)
        numPerLevel[1]++;
    }

    base_num_threads = num_addrs;
    for (int i = maxLevels - 1; i >= 0;
         --i) // count non-empty levels to get depth
      if (numPerLevel[i] != 1 || depth > 1) // only count one top-level '1'
        depth++;

    kmp_uint32 branch = minBranch;
    if (numPerLevel[0] == 1)
      branch = num_addrs / maxLeaves;
    if (branch < minBranch)
      branch = minBranch;
    for (kmp_uint32 d = 0; d < depth - 1; ++d) { // optimize hierarchy width
      while (numPerLevel[d] > branch ||
             (d == 0 && numPerLevel[d] > maxLeaves)) { // max 4 on level 0!
        if (numPerLevel[d] & 1)
          numPerLevel[d]++;
        numPerLevel[d] = numPerLevel[d] >> 1;
        if (numPerLevel[d + 1] == 1)
          depth++;
        numPerLevel[d + 1] = numPerLevel[d + 1] << 1;
      }
      if (numPerLevel[0] == 1) {
        branch = branch >> 1;
        if (branch < 4)
          branch = minBranch;
      }
    }

    for (kmp_uint32 i = 1; i < depth; ++i)
      skipPerLevel[i] = numPerLevel[i - 1] * skipPerLevel[i - 1];
    // Fill in hierarchy in the case of oversubscription
    for (kmp_uint32 i = depth; i < maxLevels; ++i)
      skipPerLevel[i] = 2 * skipPerLevel[i - 1];

    uninitialized = initialized; // One writer
  }
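  // Worked example (illustrative): init(48) with no machine topology first
  // sets numPerLevel = {4, 12}; the width optimization then reshapes this to
  // numPerLevel = {4, 3, 4} with depth 4, so skipPerLevel = {1, 4, 12, 48}
  // (capacity 4 * 3 * 4 = 48 leaves).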

  // Resize the hierarchy if nproc changes to something larger than before
  void resize(kmp_uint32 nproc) {
    kmp_int8 bool_result = KMP_COMPARE_AND_STORE_ACQ8(&resizing, 0, 1);
    while (bool_result == 0) { // someone else is trying to resize
      KMP_CPU_PAUSE();
      if (nproc <= base_num_threads) // happy with other thread's resize
        return;
      else // try to resize
        bool_result = KMP_COMPARE_AND_STORE_ACQ8(&resizing, 0, 1);
    }
    KMP_DEBUG_ASSERT(bool_result != 0);
    if (nproc <= base_num_threads)
      return; // happy with other thread's resize

    // Calculate new maxLevels
    kmp_uint32 old_sz = skipPerLevel[depth - 1];
    kmp_uint32 incs = 0, old_maxLevels = maxLevels;
    // First see if old maxLevels is enough to contain new size
    for (kmp_uint32 i = depth; i < maxLevels && nproc > old_sz; ++i) {
      skipPerLevel[i] = 2 * skipPerLevel[i - 1];
      numPerLevel[i - 1] *= 2;
      old_sz *= 2;
      depth++;
    }
    if (nproc > old_sz) { // Not enough space, need to expand hierarchy
      while (nproc > old_sz) {
        old_sz *= 2;
        incs++;
        depth++;
      }
      maxLevels += incs;

      // Resize arrays
      kmp_uint32 *old_numPerLevel = numPerLevel;
      kmp_uint32 *old_skipPerLevel = skipPerLevel;
      numPerLevel = skipPerLevel = NULL;
      numPerLevel =
          (kmp_uint32 *)__kmp_allocate(maxLevels * 2 * sizeof(kmp_uint32));
      skipPerLevel = &(numPerLevel[maxLevels]);

      // Copy old elements from old arrays
      for (kmp_uint32 i = 0; i < old_maxLevels; ++i) {
        numPerLevel[i] = old_numPerLevel[i];
        skipPerLevel[i] = old_skipPerLevel[i];
      }

      // Init new elements in arrays to 1
      for (kmp_uint32 i = old_maxLevels; i < maxLevels; ++i) {
        numPerLevel[i] = 1;
        skipPerLevel[i] = 1;
      }

      // Free old arrays
      __kmp_free(old_numPerLevel);
    }

    // Fill in oversubscription levels of hierarchy
    for (kmp_uint32 i = old_maxLevels; i < maxLevels; ++i)
      skipPerLevel[i] = 2 * skipPerLevel[i - 1];

    base_num_threads = nproc;
    resizing = 0; // One writer
  }
};
#endif // KMP_AFFINITY_H