1/*
2 * kmp_csupport.cpp -- kfront linkage support for OpenMP.
3 */
4
5//===----------------------------------------------------------------------===//
6//
7// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
8// See https://llvm.org/LICENSE.txt for license information.
9// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
10//
11//===----------------------------------------------------------------------===//
12
13#define __KMP_IMP
14#include "omp.h" /* extern "C" declarations of user-visible routines */
15#include "kmp.h"
16#include "kmp_error.h"
17#include "kmp_i18n.h"
18#include "kmp_itt.h"
19#include "kmp_lock.h"
20#include "kmp_stats.h"
21#include "ompt-specific.h"
22
23#define MAX_MESSAGE 512
24
25// flags will be used in future, e.g. to implement openmp_strict library
26// restrictions
27
28/*!
29 * @ingroup STARTUP_SHUTDOWN
30 * @param loc   in   source location information
31 * @param flags in   for future use (currently ignored)
32 *
33 * Initialize the runtime library. This call is optional; if it is not made then
34 * it will be implicitly called by attempts to use other library functions.
35 */
36void __kmpc_begin(ident_t *loc, kmp_int32 flags) {
37  // By default __kmpc_begin() is no-op.
38  char *env;
39  if ((env = getenv("KMP_INITIAL_THREAD_BIND")) != NULL &&
40      __kmp_str_match_true(env)) {
41    __kmp_middle_initialize();
42    __kmp_assign_root_init_mask();
43    KC_TRACE(10, ("__kmpc_begin: middle initialization called\n"));
44  } else if (__kmp_ignore_mppbeg() == FALSE) {
45    // By default __kmp_ignore_mppbeg() returns TRUE.
46    __kmp_internal_begin();
47    KC_TRACE(10, ("__kmpc_begin: called\n"));
48  }
49}
50
51/*!
52 * @ingroup STARTUP_SHUTDOWN
53 * @param loc source location information
54 *
 * Shut down the runtime library. This is also optional, and even if called will
56 * not do anything unless the `KMP_IGNORE_MPPEND` environment variable is set to
57 * zero.
58 */
59void __kmpc_end(ident_t *loc) {
60  // By default, __kmp_ignore_mppend() returns TRUE which makes __kmpc_end()
61  // call no-op. However, this can be overridden with KMP_IGNORE_MPPEND
62  // environment variable. If KMP_IGNORE_MPPEND is 0, __kmp_ignore_mppend()
63  // returns FALSE and __kmpc_end() will unregister this root (it can cause
64  // library shut down).
65  if (__kmp_ignore_mppend() == FALSE) {
66    KC_TRACE(10, ("__kmpc_end: called\n"));
67    KA_TRACE(30, ("__kmpc_end\n"));
68
69    __kmp_internal_end_thread(-1);
70  }
71#if KMP_OS_WINDOWS && OMPT_SUPPORT
72  // Normal exit process on Windows does not allow worker threads of the final
73  // parallel region to finish reporting their events, so shutting down the
74  // library here fixes the issue at least for the cases where __kmpc_end() is
75  // placed properly.
76  if (ompt_enabled.enabled)
77    __kmp_internal_end_library(__kmp_gtid_get_specific());
78#endif
79}
80
81/*!
82@ingroup THREAD_STATES
83@param loc Source location information.
84@return The global thread index of the active thread.
85
86This function can be called in any context.
87
If the runtime has only been entered at the outermost level from a
89single (necessarily non-OpenMP<sup>*</sup>) thread, then the thread number is
90that which would be returned by omp_get_thread_num() in the outermost
91active parallel construct. (Or zero if there is no active parallel
92construct, since the primary thread is necessarily thread zero).
93
94If multiple non-OpenMP threads all enter an OpenMP construct then this
95will be a unique thread identifier among all the threads created by
96the OpenMP runtime (but the value cannot be defined in terms of
97OpenMP thread ids returned by omp_get_thread_num()).
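
The returned value is mainly useful for passing back into other __kmpc_* entry
points. A minimal sketch (the `loc` object is assumed to exist):
@code
kmp_int32 gtid = __kmpc_global_thread_num(&loc);
__kmpc_barrier(&loc, gtid);
@endcode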
98*/
99kmp_int32 __kmpc_global_thread_num(ident_t *loc) {
100  kmp_int32 gtid = __kmp_entry_gtid();
101
102  KC_TRACE(10, ("__kmpc_global_thread_num: T#%d\n", gtid));
103
104  return gtid;
105}
106
107/*!
108@ingroup THREAD_STATES
109@param loc Source location information.
110@return The number of threads under control of the OpenMP<sup>*</sup> runtime
111
112This function can be called in any context.
It returns the total number of threads under the control of the OpenMP runtime.
That is not a number that can be determined by any OpenMP standard calls, since
the library may be called from more than one non-OpenMP thread, and this
reflects the total over all such calls. Similarly, because the runtime keeps
underlying threads around even when they are not active (the cost of creating
and destroying OS threads is high), this call counts all such threads, whether
or not they are currently waiting for work.
120*/
121kmp_int32 __kmpc_global_num_threads(ident_t *loc) {
122  KC_TRACE(10,
123           ("__kmpc_global_num_threads: num_threads = %d\n", __kmp_all_nth));
124
125  return TCR_4(__kmp_all_nth);
126}
127
128/*!
129@ingroup THREAD_STATES
130@param loc Source location information.
131@return The thread number of the calling thread in the innermost active parallel
132construct.
133*/
134kmp_int32 __kmpc_bound_thread_num(ident_t *loc) {
135  KC_TRACE(10, ("__kmpc_bound_thread_num: called\n"));
136  return __kmp_tid_from_gtid(__kmp_entry_gtid());
137}
138
139/*!
140@ingroup THREAD_STATES
141@param loc Source location information.
142@return The number of threads in the innermost active parallel construct.
143*/
144kmp_int32 __kmpc_bound_num_threads(ident_t *loc) {
145  KC_TRACE(10, ("__kmpc_bound_num_threads: called\n"));
146
147  return __kmp_entry_thread()->th.th_team->t.t_nproc;
148}
149
150/*!
151 * @ingroup DEPRECATED
152 * @param loc location description
153 *
154 * This function need not be called. It always returns TRUE.
155 */
156kmp_int32 __kmpc_ok_to_fork(ident_t *loc) {
157#ifndef KMP_DEBUG
158
159  return TRUE;
160
161#else
162
163  const char *semi2;
164  const char *semi3;
165  int line_no;
166
167  if (__kmp_par_range == 0) {
168    return TRUE;
169  }
170  semi2 = loc->psource;
171  if (semi2 == NULL) {
172    return TRUE;
173  }
174  semi2 = strchr(semi2, ';');
175  if (semi2 == NULL) {
176    return TRUE;
177  }
178  semi2 = strchr(semi2 + 1, ';');
179  if (semi2 == NULL) {
180    return TRUE;
181  }
182  if (__kmp_par_range_filename[0]) {
183    const char *name = semi2 - 1;
184    while ((name > loc->psource) && (*name != '/') && (*name != ';')) {
185      name--;
186    }
187    if ((*name == '/') || (*name == ';')) {
188      name++;
189    }
190    if (strncmp(__kmp_par_range_filename, name, semi2 - name)) {
191      return __kmp_par_range < 0;
192    }
193  }
194  semi3 = strchr(semi2 + 1, ';');
195  if (__kmp_par_range_routine[0]) {
196    if ((semi3 != NULL) && (semi3 > semi2) &&
197        (strncmp(__kmp_par_range_routine, semi2 + 1, semi3 - semi2 - 1))) {
198      return __kmp_par_range < 0;
199    }
200  }
201  if (KMP_SSCANF(semi3 + 1, "%d", &line_no) == 1) {
202    if ((line_no >= __kmp_par_range_lb) && (line_no <= __kmp_par_range_ub)) {
203      return __kmp_par_range > 0;
204    }
205    return __kmp_par_range < 0;
206  }
207  return TRUE;
208
209#endif /* KMP_DEBUG */
210}
211
212/*!
213@ingroup THREAD_STATES
214@param loc Source location information.
215@return 1 if this thread is executing inside an active parallel region, zero if
216not.
217*/
218kmp_int32 __kmpc_in_parallel(ident_t *loc) {
219  return __kmp_entry_thread()->th.th_root->r.r_active;
220}
221
222/*!
223@ingroup PARALLEL
224@param loc source location information
225@param global_tid global thread number
226@param num_threads number of threads requested for this parallel construct
227
228Set the number of threads to be used by the next fork spawned by this thread.
229This call is only required if the parallel construct has a `num_threads` clause.
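
For example, `#pragma omp parallel num_threads(4)` is typically lowered to a
call of this function immediately before the fork. A sketch, where `outlined`
stands for the compiler-generated routine and `loc` is assumed to exist:
@code
kmp_int32 gtid = __kmpc_global_thread_num(&loc);
__kmpc_push_num_threads(&loc, gtid, 4);
__kmpc_fork_call(&loc, 0, (kmpc_micro)outlined);
@endcode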
230*/
231void __kmpc_push_num_threads(ident_t *loc, kmp_int32 global_tid,
232                             kmp_int32 num_threads) {
233  KA_TRACE(20, ("__kmpc_push_num_threads: enter T#%d num_threads=%d\n",
234                global_tid, num_threads));
235  __kmp_assert_valid_gtid(global_tid);
236  __kmp_push_num_threads(loc, global_tid, num_threads);
237}
238
239void __kmpc_pop_num_threads(ident_t *loc, kmp_int32 global_tid) {
240  KA_TRACE(20, ("__kmpc_pop_num_threads: enter\n"));
241  /* the num_threads are automatically popped */
242}
243
244void __kmpc_push_proc_bind(ident_t *loc, kmp_int32 global_tid,
245                           kmp_int32 proc_bind) {
246  KA_TRACE(20, ("__kmpc_push_proc_bind: enter T#%d proc_bind=%d\n", global_tid,
247                proc_bind));
248  __kmp_assert_valid_gtid(global_tid);
249  __kmp_push_proc_bind(loc, global_tid, (kmp_proc_bind_t)proc_bind);
250}
251
252/*!
253@ingroup PARALLEL
254@param loc  source location information
255@param argc  total number of arguments in the ellipsis
256@param microtask  pointer to callback routine consisting of outlined parallel
257construct
258@param ...  pointers to shared variables that aren't global
259
260Do the actual fork and call the microtask in the relevant number of threads.
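
As a rough illustration (not the exact code any particular compiler emits), a
parallel region that shares one variable might be lowered as below; `outlined`
and `loc` are placeholder names:
@code
void outlined(kmp_int32 *global_tid, kmp_int32 *bound_tid, int *x) {
  // ... body of the parallel region, operating on *x ...
}
// at the call site of #pragma omp parallel shared(x):
int x = 0;
__kmpc_fork_call(&loc, 1, (kmpc_micro)outlined, &x);
@endcode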
261*/
262void __kmpc_fork_call(ident_t *loc, kmp_int32 argc, kmpc_micro microtask, ...) {
263  int gtid = __kmp_entry_gtid();
264
265#if (KMP_STATS_ENABLED)
266  // If we were in a serial region, then stop the serial timer, record
267  // the event, and start parallel region timer
268  stats_state_e previous_state = KMP_GET_THREAD_STATE();
269  if (previous_state == stats_state_e::SERIAL_REGION) {
270    KMP_EXCHANGE_PARTITIONED_TIMER(OMP_parallel_overhead);
271  } else {
272    KMP_PUSH_PARTITIONED_TIMER(OMP_parallel_overhead);
273  }
274  int inParallel = __kmpc_in_parallel(loc);
275  if (inParallel) {
276    KMP_COUNT_BLOCK(OMP_NESTED_PARALLEL);
277  } else {
278    KMP_COUNT_BLOCK(OMP_PARALLEL);
279  }
280#endif
281
  // maybe saving thr_state here would be enough
283  {
284    va_list ap;
285    va_start(ap, microtask);
286
287#if OMPT_SUPPORT
288    ompt_frame_t *ompt_frame;
289    if (ompt_enabled.enabled) {
290      kmp_info_t *master_th = __kmp_threads[gtid];
291      ompt_frame = &master_th->th.th_current_task->ompt_task_info.frame;
292      ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
293    }
294    OMPT_STORE_RETURN_ADDRESS(gtid);
295#endif
296
297#if INCLUDE_SSC_MARKS
298    SSC_MARK_FORKING();
299#endif
300    __kmp_fork_call(loc, gtid, fork_context_intel, argc,
301                    VOLATILE_CAST(microtask_t) microtask, // "wrapped" task
302                    VOLATILE_CAST(launch_t) __kmp_invoke_task_func,
303                    kmp_va_addr_of(ap));
304#if INCLUDE_SSC_MARKS
305    SSC_MARK_JOINING();
306#endif
307    __kmp_join_call(loc, gtid
308#if OMPT_SUPPORT
309                    ,
310                    fork_context_intel
311#endif
312    );
313
314    va_end(ap);
315
316#if OMPT_SUPPORT
317    if (ompt_enabled.enabled) {
318      ompt_frame->enter_frame = ompt_data_none;
319    }
320#endif
321  }
322
323#if KMP_STATS_ENABLED
324  if (previous_state == stats_state_e::SERIAL_REGION) {
325    KMP_EXCHANGE_PARTITIONED_TIMER(OMP_serial);
326    KMP_SET_THREAD_STATE(previous_state);
327  } else {
328    KMP_POP_PARTITIONED_TIMER();
329  }
330#endif // KMP_STATS_ENABLED
331}
332
333/*!
334@ingroup PARALLEL
@param loc  source location information
@param argc  total number of arguments for the microtask
@param microtask  pointer to callback routine consisting of outlined parallel
construct
338@param cond  condition for running in parallel
339@param args  struct of pointers to shared variables that aren't global
340
341Perform a fork only if the condition is true.
342*/
343void __kmpc_fork_call_if(ident_t *loc, kmp_int32 argc, kmpc_micro microtask,
344                         kmp_int32 cond, void *args) {
345  int gtid = __kmp_entry_gtid();
346  if (cond) {
347    if (args)
348      __kmpc_fork_call(loc, argc, microtask, args);
349    else
350      __kmpc_fork_call(loc, argc, microtask);
351  } else {
352    __kmpc_serialized_parallel(loc, gtid);
353
354#if OMPT_SUPPORT
355    void *exit_frame_ptr;
356#endif
357
358    if (args)
359      __kmp_invoke_microtask(VOLATILE_CAST(microtask_t) microtask, gtid,
360                             /*npr=*/0,
361                             /*argc=*/1, &args
362#if OMPT_SUPPORT
363                             ,
364                             &exit_frame_ptr
365#endif
366      );
367    else
368      __kmp_invoke_microtask(VOLATILE_CAST(microtask_t) microtask, gtid,
369                             /*npr=*/0,
370                             /*argc=*/0,
371                             /*args=*/nullptr
372#if OMPT_SUPPORT
373                             ,
374                             &exit_frame_ptr
375#endif
376      );
377
378    __kmpc_end_serialized_parallel(loc, gtid);
379  }
380}
381
382/*!
383@ingroup PARALLEL
384@param loc source location information
385@param global_tid global thread number
386@param num_teams number of teams requested for the teams construct
387@param num_threads number of threads per team requested for the teams construct
388
389Set the number of teams to be used by the teams construct.
390This call is only required if the teams construct has a `num_teams` clause
391or a `thread_limit` clause (or both).
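
For example, `#pragma omp teams num_teams(4) thread_limit(8)` might be lowered
roughly as below; `teams_outlined` and `loc` are placeholder names:
@code
kmp_int32 gtid = __kmpc_global_thread_num(&loc);
__kmpc_push_num_teams(&loc, gtid, 4, 8);
__kmpc_fork_teams(&loc, 0, (kmpc_micro)teams_outlined);
@endcode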
392*/
393void __kmpc_push_num_teams(ident_t *loc, kmp_int32 global_tid,
394                           kmp_int32 num_teams, kmp_int32 num_threads) {
395  KA_TRACE(20,
396           ("__kmpc_push_num_teams: enter T#%d num_teams=%d num_threads=%d\n",
397            global_tid, num_teams, num_threads));
398  __kmp_assert_valid_gtid(global_tid);
399  __kmp_push_num_teams(loc, global_tid, num_teams, num_threads);
400}
401
402/*!
403@ingroup PARALLEL
404@param loc source location information
405@param global_tid global thread number
406@param thread_limit limit on number of threads which can be created within the
407current task
408
Set the thread_limit for the current task.
This call is there to support the `thread_limit` clause on the `target`
construct.
411*/
412void __kmpc_set_thread_limit(ident_t *loc, kmp_int32 global_tid,
413                             kmp_int32 thread_limit) {
414  __kmp_assert_valid_gtid(global_tid);
415  kmp_info_t *thread = __kmp_threads[global_tid];
416  if (thread_limit > 0)
417    thread->th.th_current_task->td_icvs.task_thread_limit = thread_limit;
418}
419
420/*!
421@ingroup PARALLEL
422@param loc source location information
423@param global_tid global thread number
424@param num_teams_lb lower bound on number of teams requested for the teams
425construct
426@param num_teams_ub upper bound on number of teams requested for the teams
427construct
428@param num_threads number of threads per team requested for the teams construct
429
430Set the number of teams to be used by the teams construct. The number of initial
teams created will be greater than or equal to the lower bound and less than or
432equal to the upper bound.
433This call is only required if the teams construct has a `num_teams` clause
434or a `thread_limit` clause (or both).
435*/
436void __kmpc_push_num_teams_51(ident_t *loc, kmp_int32 global_tid,
437                              kmp_int32 num_teams_lb, kmp_int32 num_teams_ub,
438                              kmp_int32 num_threads) {
439  KA_TRACE(20, ("__kmpc_push_num_teams_51: enter T#%d num_teams_lb=%d"
440                " num_teams_ub=%d num_threads=%d\n",
441                global_tid, num_teams_lb, num_teams_ub, num_threads));
442  __kmp_assert_valid_gtid(global_tid);
443  __kmp_push_num_teams_51(loc, global_tid, num_teams_lb, num_teams_ub,
444                          num_threads);
445}
446
447/*!
448@ingroup PARALLEL
449@param loc  source location information
450@param argc  total number of arguments in the ellipsis
451@param microtask  pointer to callback routine consisting of outlined teams
452construct
453@param ...  pointers to shared variables that aren't global
454
455Do the actual fork and call the microtask in the relevant number of threads.
456*/
457void __kmpc_fork_teams(ident_t *loc, kmp_int32 argc, kmpc_micro microtask,
458                       ...) {
459  int gtid = __kmp_entry_gtid();
460  kmp_info_t *this_thr = __kmp_threads[gtid];
461  va_list ap;
462  va_start(ap, microtask);
463
464#if KMP_STATS_ENABLED
465  KMP_COUNT_BLOCK(OMP_TEAMS);
466  stats_state_e previous_state = KMP_GET_THREAD_STATE();
467  if (previous_state == stats_state_e::SERIAL_REGION) {
468    KMP_EXCHANGE_PARTITIONED_TIMER(OMP_teams_overhead);
469  } else {
470    KMP_PUSH_PARTITIONED_TIMER(OMP_teams_overhead);
471  }
472#endif
473
474  // remember teams entry point and nesting level
475  this_thr->th.th_teams_microtask = microtask;
476  this_thr->th.th_teams_level =
477      this_thr->th.th_team->t.t_level; // AC: can be >0 on host
478
479#if OMPT_SUPPORT
480  kmp_team_t *parent_team = this_thr->th.th_team;
481  int tid = __kmp_tid_from_gtid(gtid);
482  if (ompt_enabled.enabled) {
483    parent_team->t.t_implicit_task_taskdata[tid]
484        .ompt_task_info.frame.enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
485  }
486  OMPT_STORE_RETURN_ADDRESS(gtid);
487#endif
488
  // check if __kmpc_push_num_teams was called; if not, set the default number
  // of teams
491  if (this_thr->th.th_teams_size.nteams == 0) {
492    __kmp_push_num_teams(loc, gtid, 0, 0);
493  }
494  KMP_DEBUG_ASSERT(this_thr->th.th_set_nproc >= 1);
495  KMP_DEBUG_ASSERT(this_thr->th.th_teams_size.nteams >= 1);
496  KMP_DEBUG_ASSERT(this_thr->th.th_teams_size.nth >= 1);
497
498  __kmp_fork_call(
499      loc, gtid, fork_context_intel, argc,
500      VOLATILE_CAST(microtask_t) __kmp_teams_master, // "wrapped" task
501      VOLATILE_CAST(launch_t) __kmp_invoke_teams_master, kmp_va_addr_of(ap));
502  __kmp_join_call(loc, gtid
503#if OMPT_SUPPORT
504                  ,
505                  fork_context_intel
506#endif
507  );
508
509  // Pop current CG root off list
510  KMP_DEBUG_ASSERT(this_thr->th.th_cg_roots);
511  kmp_cg_root_t *tmp = this_thr->th.th_cg_roots;
512  this_thr->th.th_cg_roots = tmp->up;
513  KA_TRACE(100, ("__kmpc_fork_teams: Thread %p popping node %p and moving up"
514                 " to node %p. cg_nthreads was %d\n",
515                 this_thr, tmp, this_thr->th.th_cg_roots, tmp->cg_nthreads));
516  KMP_DEBUG_ASSERT(tmp->cg_nthreads);
517  int i = tmp->cg_nthreads--;
  if (i == 1) { // check if we are the last thread in CG (not always the case)
519    __kmp_free(tmp);
520  }
521  // Restore current task's thread_limit from CG root
522  KMP_DEBUG_ASSERT(this_thr->th.th_cg_roots);
523  this_thr->th.th_current_task->td_icvs.thread_limit =
524      this_thr->th.th_cg_roots->cg_thread_limit;
525
526  this_thr->th.th_teams_microtask = NULL;
527  this_thr->th.th_teams_level = 0;
528  *(kmp_int64 *)(&this_thr->th.th_teams_size) = 0L;
529  va_end(ap);
530#if KMP_STATS_ENABLED
531  if (previous_state == stats_state_e::SERIAL_REGION) {
532    KMP_EXCHANGE_PARTITIONED_TIMER(OMP_serial);
533    KMP_SET_THREAD_STATE(previous_state);
534  } else {
535    KMP_POP_PARTITIONED_TIMER();
536  }
537#endif // KMP_STATS_ENABLED
538}
539
540// I don't think this function should ever have been exported.
541// The __kmpc_ prefix was misapplied.  I'm fairly certain that no generated
542// openmp code ever called it, but it's been exported from the RTL for so
543// long that I'm afraid to remove the definition.
544int __kmpc_invoke_task_func(int gtid) { return __kmp_invoke_task_func(gtid); }
545
546/*!
547@ingroup PARALLEL
548@param loc  source location information
549@param global_tid  global thread number
550
551Enter a serialized parallel construct. This interface is used to handle a
552conditional parallel region, like this,
553@code
554#pragma omp parallel if (condition)
555@endcode
556when the condition is false.
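
In that case the compiler typically emits something like the following instead
of a real fork (a sketch only; `outlined` and `loc` are placeholder names):
@code
kmp_int32 gtid = __kmpc_global_thread_num(&loc);
kmp_int32 zero = 0;
__kmpc_serialized_parallel(&loc, gtid);
outlined(&gtid, &zero); // run the region body on the encountering thread
__kmpc_end_serialized_parallel(&loc, gtid);
@endcode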
557*/
558void __kmpc_serialized_parallel(ident_t *loc, kmp_int32 global_tid) {
559  // The implementation is now in kmp_runtime.cpp so that it can share static
560  // functions with kmp_fork_call since the tasks to be done are similar in
561  // each case.
562  __kmp_assert_valid_gtid(global_tid);
563#if OMPT_SUPPORT
564  OMPT_STORE_RETURN_ADDRESS(global_tid);
565#endif
566  __kmp_serialized_parallel(loc, global_tid);
567}
568
569/*!
570@ingroup PARALLEL
571@param loc  source location information
572@param global_tid  global thread number
573
574Leave a serialized parallel construct.
575*/
576void __kmpc_end_serialized_parallel(ident_t *loc, kmp_int32 global_tid) {
577  kmp_internal_control_t *top;
578  kmp_info_t *this_thr;
579  kmp_team_t *serial_team;
580
581  KC_TRACE(10,
582           ("__kmpc_end_serialized_parallel: called by T#%d\n", global_tid));
583
584  /* skip all this code for autopar serialized loops since it results in
585     unacceptable overhead */
586  if (loc != NULL && (loc->flags & KMP_IDENT_AUTOPAR))
587    return;
588
589  // Not autopar code
590  __kmp_assert_valid_gtid(global_tid);
591  if (!TCR_4(__kmp_init_parallel))
592    __kmp_parallel_initialize();
593
594  __kmp_resume_if_soft_paused();
595
596  this_thr = __kmp_threads[global_tid];
597  serial_team = this_thr->th.th_serial_team;
598
599  kmp_task_team_t *task_team = this_thr->th.th_task_team;
600  // we need to wait for the proxy tasks before finishing the thread
601  if (task_team != NULL && (task_team->tt.tt_found_proxy_tasks ||
602                            task_team->tt.tt_hidden_helper_task_encountered))
603    __kmp_task_team_wait(this_thr, serial_team USE_ITT_BUILD_ARG(NULL));
604
605  KMP_MB();
606  KMP_DEBUG_ASSERT(serial_team);
607  KMP_ASSERT(serial_team->t.t_serialized);
608  KMP_DEBUG_ASSERT(this_thr->th.th_team == serial_team);
609  KMP_DEBUG_ASSERT(serial_team != this_thr->th.th_root->r.r_root_team);
610  KMP_DEBUG_ASSERT(serial_team->t.t_threads);
611  KMP_DEBUG_ASSERT(serial_team->t.t_threads[0] == this_thr);
612
613#if OMPT_SUPPORT
614  if (ompt_enabled.enabled &&
615      this_thr->th.ompt_thread_info.state != ompt_state_overhead) {
616    OMPT_CUR_TASK_INFO(this_thr)->frame.exit_frame = ompt_data_none;
617    if (ompt_enabled.ompt_callback_implicit_task) {
618      ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
619          ompt_scope_end, NULL, OMPT_CUR_TASK_DATA(this_thr), 1,
620          OMPT_CUR_TASK_INFO(this_thr)->thread_num, ompt_task_implicit);
621    }
622
    // reset/clear the task id only after unlinking the task
624    ompt_data_t *parent_task_data;
625    __ompt_get_task_info_internal(1, NULL, &parent_task_data, NULL, NULL, NULL);
626
627    if (ompt_enabled.ompt_callback_parallel_end) {
628      ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
629          &(serial_team->t.ompt_team_info.parallel_data), parent_task_data,
630          ompt_parallel_invoker_program | ompt_parallel_team,
631          OMPT_LOAD_RETURN_ADDRESS(global_tid));
632    }
633    __ompt_lw_taskteam_unlink(this_thr);
634    this_thr->th.ompt_thread_info.state = ompt_state_overhead;
635  }
636#endif
637
638  /* If necessary, pop the internal control stack values and replace the team
639   * values */
640  top = serial_team->t.t_control_stack_top;
641  if (top && top->serial_nesting_level == serial_team->t.t_serialized) {
642    copy_icvs(&serial_team->t.t_threads[0]->th.th_current_task->td_icvs, top);
643    serial_team->t.t_control_stack_top = top->next;
644    __kmp_free(top);
645  }
646
647  /* pop dispatch buffers stack */
648  KMP_DEBUG_ASSERT(serial_team->t.t_dispatch->th_disp_buffer);
649  {
650    dispatch_private_info_t *disp_buffer =
651        serial_team->t.t_dispatch->th_disp_buffer;
652    serial_team->t.t_dispatch->th_disp_buffer =
653        serial_team->t.t_dispatch->th_disp_buffer->next;
654    __kmp_free(disp_buffer);
655  }
656  this_thr->th.th_def_allocator = serial_team->t.t_def_allocator; // restore
657
658  --serial_team->t.t_serialized;
659  if (serial_team->t.t_serialized == 0) {
660
661    /* return to the parallel section */
662
663#if KMP_ARCH_X86 || KMP_ARCH_X86_64
664    if (__kmp_inherit_fp_control && serial_team->t.t_fp_control_saved) {
665      __kmp_clear_x87_fpu_status_word();
666      __kmp_load_x87_fpu_control_word(&serial_team->t.t_x87_fpu_control_word);
667      __kmp_load_mxcsr(&serial_team->t.t_mxcsr);
668    }
669#endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
670
671    __kmp_pop_current_task_from_thread(this_thr);
672#if OMPD_SUPPORT
673    if (ompd_state & OMPD_ENABLE_BP)
674      ompd_bp_parallel_end();
675#endif
676
677    this_thr->th.th_team = serial_team->t.t_parent;
678    this_thr->th.th_info.ds.ds_tid = serial_team->t.t_master_tid;
679
680    /* restore values cached in the thread */
681    this_thr->th.th_team_nproc = serial_team->t.t_parent->t.t_nproc; /*  JPH */
682    this_thr->th.th_team_master =
683        serial_team->t.t_parent->t.t_threads[0]; /* JPH */
684    this_thr->th.th_team_serialized = this_thr->th.th_team->t.t_serialized;
685
686    /* TODO the below shouldn't need to be adjusted for serialized teams */
687    this_thr->th.th_dispatch =
688        &this_thr->th.th_team->t.t_dispatch[serial_team->t.t_master_tid];
689
690    KMP_ASSERT(this_thr->th.th_current_task->td_flags.executing == 0);
691    this_thr->th.th_current_task->td_flags.executing = 1;
692
693    if (__kmp_tasking_mode != tskm_immediate_exec) {
694      // Copy the task team from the new child / old parent team to the thread.
695      this_thr->th.th_task_team =
696          this_thr->th.th_team->t.t_task_team[this_thr->th.th_task_state];
697      KA_TRACE(20,
698               ("__kmpc_end_serialized_parallel: T#%d restoring task_team %p / "
699                "team %p\n",
700                global_tid, this_thr->th.th_task_team, this_thr->th.th_team));
701    }
702#if KMP_AFFINITY_SUPPORTED
703    if (this_thr->th.th_team->t.t_level == 0 && __kmp_affinity.flags.reset) {
704      __kmp_reset_root_init_mask(global_tid);
705    }
706#endif
707  } else {
708    if (__kmp_tasking_mode != tskm_immediate_exec) {
709      KA_TRACE(20, ("__kmpc_end_serialized_parallel: T#%d decreasing nesting "
710                    "depth of serial team %p to %d\n",
711                    global_tid, serial_team, serial_team->t.t_serialized));
712    }
713  }
714
715  serial_team->t.t_level--;
716  if (__kmp_env_consistency_check)
717    __kmp_pop_parallel(global_tid, NULL);
718#if OMPT_SUPPORT
719  if (ompt_enabled.enabled)
720    this_thr->th.ompt_thread_info.state =
721        ((this_thr->th.th_team_serialized) ? ompt_state_work_serial
722                                           : ompt_state_work_parallel);
723#endif
724}
725
726/*!
727@ingroup SYNCHRONIZATION
728@param loc  source location information.
729
730Execute <tt>flush</tt>. This is implemented as a full memory fence. (Though
731depending on the memory ordering convention obeyed by the compiler
732even that may not be necessary).
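
For example, `#pragma omp flush` lowers to a plain call (sketch only; `loc` is
assumed to exist):
@code
__kmpc_flush(&loc);
@endcode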
733*/
734void __kmpc_flush(ident_t *loc) {
735  KC_TRACE(10, ("__kmpc_flush: called\n"));
736
  /* need explicit __mf() here since we use volatile instead in the library */
738  KMP_MFENCE(); /* Flush all pending memory write invalidates.  */
739
740#if OMPT_SUPPORT && OMPT_OPTIONAL
741  if (ompt_enabled.ompt_callback_flush) {
742    ompt_callbacks.ompt_callback(ompt_callback_flush)(
743        __ompt_get_thread_data_internal(), OMPT_GET_RETURN_ADDRESS(0));
744  }
745#endif
746}
747
748/* -------------------------------------------------------------------------- */
749/*!
750@ingroup SYNCHRONIZATION
751@param loc source location information
752@param global_tid thread id.
753
754Execute a barrier.
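
For example, an explicit `#pragma omp barrier` is lowered roughly as (sketch
only; `loc` is assumed to exist):
@code
__kmpc_barrier(&loc, __kmpc_global_thread_num(&loc));
@endcode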
755*/
756void __kmpc_barrier(ident_t *loc, kmp_int32 global_tid) {
757  KMP_COUNT_BLOCK(OMP_BARRIER);
758  KC_TRACE(10, ("__kmpc_barrier: called T#%d\n", global_tid));
759  __kmp_assert_valid_gtid(global_tid);
760
761  if (!TCR_4(__kmp_init_parallel))
762    __kmp_parallel_initialize();
763
764  __kmp_resume_if_soft_paused();
765
766  if (__kmp_env_consistency_check) {
767    if (loc == 0) {
768      KMP_WARNING(ConstructIdentInvalid); // ??? What does it mean for the user?
769    }
770    __kmp_check_barrier(global_tid, ct_barrier, loc);
771  }
772
773#if OMPT_SUPPORT
774  ompt_frame_t *ompt_frame;
775  if (ompt_enabled.enabled) {
776    __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
777    if (ompt_frame->enter_frame.ptr == NULL)
778      ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
779  }
780  OMPT_STORE_RETURN_ADDRESS(global_tid);
781#endif
782  __kmp_threads[global_tid]->th.th_ident = loc;
783  // TODO: explicit barrier_wait_id:
784  //   this function is called when 'barrier' directive is present or
785  //   implicit barrier at the end of a worksharing construct.
786  // 1) better to add a per-thread barrier counter to a thread data structure
787  // 2) set to 0 when a new team is created
788  // 4) no sync is required
789
790  __kmp_barrier(bs_plain_barrier, global_tid, FALSE, 0, NULL, NULL);
791#if OMPT_SUPPORT && OMPT_OPTIONAL
792  if (ompt_enabled.enabled) {
793    ompt_frame->enter_frame = ompt_data_none;
794  }
795#endif
796}
797
798/* The BARRIER for a MASTER section is always explicit   */
799/*!
800@ingroup WORK_SHARING
801@param loc  source location information.
802@param global_tid  global thread number .
803@return 1 if this thread should execute the <tt>master</tt> block, 0 otherwise.
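
A <tt>master</tt> region is typically guarded like this (sketch only; `loc`
and `gtid` are assumed to exist):
@code
if (__kmpc_master(&loc, gtid)) {
  // ... body of the master region ...
  __kmpc_end_master(&loc, gtid);
}
@endcode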
804*/
805kmp_int32 __kmpc_master(ident_t *loc, kmp_int32 global_tid) {
806  int status = 0;
807
808  KC_TRACE(10, ("__kmpc_master: called T#%d\n", global_tid));
809  __kmp_assert_valid_gtid(global_tid);
810
811  if (!TCR_4(__kmp_init_parallel))
812    __kmp_parallel_initialize();
813
814  __kmp_resume_if_soft_paused();
815
816  if (KMP_MASTER_GTID(global_tid)) {
817    KMP_COUNT_BLOCK(OMP_MASTER);
818    KMP_PUSH_PARTITIONED_TIMER(OMP_master);
819    status = 1;
820  }
821
822#if OMPT_SUPPORT && OMPT_OPTIONAL
823  if (status) {
824    if (ompt_enabled.ompt_callback_masked) {
825      kmp_info_t *this_thr = __kmp_threads[global_tid];
826      kmp_team_t *team = this_thr->th.th_team;
827
828      int tid = __kmp_tid_from_gtid(global_tid);
829      ompt_callbacks.ompt_callback(ompt_callback_masked)(
830          ompt_scope_begin, &(team->t.ompt_team_info.parallel_data),
831          &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data),
832          OMPT_GET_RETURN_ADDRESS(0));
833    }
834  }
835#endif
836
837  if (__kmp_env_consistency_check) {
838#if KMP_USE_DYNAMIC_LOCK
839    if (status)
840      __kmp_push_sync(global_tid, ct_master, loc, NULL, 0);
841    else
842      __kmp_check_sync(global_tid, ct_master, loc, NULL, 0);
843#else
844    if (status)
845      __kmp_push_sync(global_tid, ct_master, loc, NULL);
846    else
847      __kmp_check_sync(global_tid, ct_master, loc, NULL);
848#endif
849  }
850
851  return status;
852}
853
854/*!
855@ingroup WORK_SHARING
856@param loc  source location information.
857@param global_tid  global thread number .
858
859Mark the end of a <tt>master</tt> region. This should only be called by the
860thread that executes the <tt>master</tt> region.
861*/
862void __kmpc_end_master(ident_t *loc, kmp_int32 global_tid) {
863  KC_TRACE(10, ("__kmpc_end_master: called T#%d\n", global_tid));
864  __kmp_assert_valid_gtid(global_tid);
865  KMP_DEBUG_ASSERT(KMP_MASTER_GTID(global_tid));
866  KMP_POP_PARTITIONED_TIMER();
867
868#if OMPT_SUPPORT && OMPT_OPTIONAL
869  kmp_info_t *this_thr = __kmp_threads[global_tid];
870  kmp_team_t *team = this_thr->th.th_team;
871  if (ompt_enabled.ompt_callback_masked) {
872    int tid = __kmp_tid_from_gtid(global_tid);
873    ompt_callbacks.ompt_callback(ompt_callback_masked)(
874        ompt_scope_end, &(team->t.ompt_team_info.parallel_data),
875        &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data),
876        OMPT_GET_RETURN_ADDRESS(0));
877  }
878#endif
879
880  if (__kmp_env_consistency_check) {
881    if (KMP_MASTER_GTID(global_tid))
882      __kmp_pop_sync(global_tid, ct_master, loc);
883  }
884}
885
886/*!
887@ingroup WORK_SHARING
888@param loc  source location information.
889@param global_tid  global thread number.
890@param filter result of evaluating filter clause on thread global_tid, or zero
891if no filter clause present
892@return 1 if this thread should execute the <tt>masked</tt> block, 0 otherwise.
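
A <tt>masked</tt> region with a filter clause is typically guarded like this
(sketch only; `loc`, `gtid` and `filter_expr` are assumed to exist):
@code
if (__kmpc_masked(&loc, gtid, filter_expr)) {
  // ... body of the masked region ...
  __kmpc_end_masked(&loc, gtid);
}
@endcode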
893*/
894kmp_int32 __kmpc_masked(ident_t *loc, kmp_int32 global_tid, kmp_int32 filter) {
895  int status = 0;
896  int tid;
897  KC_TRACE(10, ("__kmpc_masked: called T#%d\n", global_tid));
898  __kmp_assert_valid_gtid(global_tid);
899
900  if (!TCR_4(__kmp_init_parallel))
901    __kmp_parallel_initialize();
902
903  __kmp_resume_if_soft_paused();
904
905  tid = __kmp_tid_from_gtid(global_tid);
906  if (tid == filter) {
907    KMP_COUNT_BLOCK(OMP_MASKED);
908    KMP_PUSH_PARTITIONED_TIMER(OMP_masked);
909    status = 1;
910  }
911
912#if OMPT_SUPPORT && OMPT_OPTIONAL
913  if (status) {
914    if (ompt_enabled.ompt_callback_masked) {
915      kmp_info_t *this_thr = __kmp_threads[global_tid];
916      kmp_team_t *team = this_thr->th.th_team;
917      ompt_callbacks.ompt_callback(ompt_callback_masked)(
918          ompt_scope_begin, &(team->t.ompt_team_info.parallel_data),
919          &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data),
920          OMPT_GET_RETURN_ADDRESS(0));
921    }
922  }
923#endif
924
925  if (__kmp_env_consistency_check) {
926#if KMP_USE_DYNAMIC_LOCK
927    if (status)
928      __kmp_push_sync(global_tid, ct_masked, loc, NULL, 0);
929    else
930      __kmp_check_sync(global_tid, ct_masked, loc, NULL, 0);
931#else
932    if (status)
933      __kmp_push_sync(global_tid, ct_masked, loc, NULL);
934    else
935      __kmp_check_sync(global_tid, ct_masked, loc, NULL);
936#endif
937  }
938
939  return status;
940}
941
942/*!
943@ingroup WORK_SHARING
944@param loc  source location information.
945@param global_tid  global thread number .
946
947Mark the end of a <tt>masked</tt> region. This should only be called by the
948thread that executes the <tt>masked</tt> region.
949*/
950void __kmpc_end_masked(ident_t *loc, kmp_int32 global_tid) {
951  KC_TRACE(10, ("__kmpc_end_masked: called T#%d\n", global_tid));
952  __kmp_assert_valid_gtid(global_tid);
953  KMP_POP_PARTITIONED_TIMER();
954
955#if OMPT_SUPPORT && OMPT_OPTIONAL
956  kmp_info_t *this_thr = __kmp_threads[global_tid];
957  kmp_team_t *team = this_thr->th.th_team;
958  if (ompt_enabled.ompt_callback_masked) {
959    int tid = __kmp_tid_from_gtid(global_tid);
960    ompt_callbacks.ompt_callback(ompt_callback_masked)(
961        ompt_scope_end, &(team->t.ompt_team_info.parallel_data),
962        &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data),
963        OMPT_GET_RETURN_ADDRESS(0));
964  }
965#endif
966
967  if (__kmp_env_consistency_check) {
968    __kmp_pop_sync(global_tid, ct_masked, loc);
969  }
970}
971
972/*!
973@ingroup WORK_SHARING
974@param loc  source location information.
975@param gtid  global thread number.
976
977Start execution of an <tt>ordered</tt> construct.
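
Within a loop that has an `ordered` clause, the body of the <tt>ordered</tt>
construct is typically bracketed like this (sketch only; `loc` and `gtid` are
assumed to exist):
@code
__kmpc_ordered(&loc, gtid);
// ... code that must execute in iteration order ...
__kmpc_end_ordered(&loc, gtid);
@endcode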
978*/
979void __kmpc_ordered(ident_t *loc, kmp_int32 gtid) {
980  int cid = 0;
981  kmp_info_t *th;
982  KMP_DEBUG_ASSERT(__kmp_init_serial);
983
984  KC_TRACE(10, ("__kmpc_ordered: called T#%d\n", gtid));
985  __kmp_assert_valid_gtid(gtid);
986
987  if (!TCR_4(__kmp_init_parallel))
988    __kmp_parallel_initialize();
989
990  __kmp_resume_if_soft_paused();
991
992#if USE_ITT_BUILD
993  __kmp_itt_ordered_prep(gtid);
994// TODO: ordered_wait_id
995#endif /* USE_ITT_BUILD */
996
997  th = __kmp_threads[gtid];
998
999#if OMPT_SUPPORT && OMPT_OPTIONAL
1000  kmp_team_t *team;
1001  ompt_wait_id_t lck;
1002  void *codeptr_ra;
1003  OMPT_STORE_RETURN_ADDRESS(gtid);
1004  if (ompt_enabled.enabled) {
1005    team = __kmp_team_from_gtid(gtid);
1006    lck = (ompt_wait_id_t)(uintptr_t)&team->t.t_ordered.dt.t_value;
1007    /* OMPT state update */
1008    th->th.ompt_thread_info.wait_id = lck;
1009    th->th.ompt_thread_info.state = ompt_state_wait_ordered;
1010
1011    /* OMPT event callback */
1012    codeptr_ra = OMPT_LOAD_RETURN_ADDRESS(gtid);
1013    if (ompt_enabled.ompt_callback_mutex_acquire) {
1014      ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
1015          ompt_mutex_ordered, omp_lock_hint_none, kmp_mutex_impl_spin, lck,
1016          codeptr_ra);
1017    }
1018  }
1019#endif
1020
1021  if (th->th.th_dispatch->th_deo_fcn != 0)
1022    (*th->th.th_dispatch->th_deo_fcn)(&gtid, &cid, loc);
1023  else
1024    __kmp_parallel_deo(&gtid, &cid, loc);
1025
1026#if OMPT_SUPPORT && OMPT_OPTIONAL
1027  if (ompt_enabled.enabled) {
1028    /* OMPT state update */
1029    th->th.ompt_thread_info.state = ompt_state_work_parallel;
1030    th->th.ompt_thread_info.wait_id = 0;
1031
1032    /* OMPT event callback */
1033    if (ompt_enabled.ompt_callback_mutex_acquired) {
1034      ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
1035          ompt_mutex_ordered, (ompt_wait_id_t)(uintptr_t)lck, codeptr_ra);
1036    }
1037  }
1038#endif
1039
1040#if USE_ITT_BUILD
1041  __kmp_itt_ordered_start(gtid);
1042#endif /* USE_ITT_BUILD */
1043}
1044
1045/*!
1046@ingroup WORK_SHARING
1047@param loc  source location information.
1048@param gtid  global thread number.
1049
1050End execution of an <tt>ordered</tt> construct.
1051*/
1052void __kmpc_end_ordered(ident_t *loc, kmp_int32 gtid) {
1053  int cid = 0;
1054  kmp_info_t *th;
1055
1056  KC_TRACE(10, ("__kmpc_end_ordered: called T#%d\n", gtid));
1057  __kmp_assert_valid_gtid(gtid);
1058
1059#if USE_ITT_BUILD
1060  __kmp_itt_ordered_end(gtid);
1061// TODO: ordered_wait_id
1062#endif /* USE_ITT_BUILD */
1063
1064  th = __kmp_threads[gtid];
1065
1066  if (th->th.th_dispatch->th_dxo_fcn != 0)
1067    (*th->th.th_dispatch->th_dxo_fcn)(&gtid, &cid, loc);
1068  else
1069    __kmp_parallel_dxo(&gtid, &cid, loc);
1070
1071#if OMPT_SUPPORT && OMPT_OPTIONAL
1072  OMPT_STORE_RETURN_ADDRESS(gtid);
1073  if (ompt_enabled.ompt_callback_mutex_released) {
1074    ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
1075        ompt_mutex_ordered,
1076        (ompt_wait_id_t)(uintptr_t)&__kmp_team_from_gtid(gtid)
1077            ->t.t_ordered.dt.t_value,
1078        OMPT_LOAD_RETURN_ADDRESS(gtid));
1079  }
1080#endif
1081}
1082
1083#if KMP_USE_DYNAMIC_LOCK
1084
1085static __forceinline void
1086__kmp_init_indirect_csptr(kmp_critical_name *crit, ident_t const *loc,
1087                          kmp_int32 gtid, kmp_indirect_locktag_t tag) {
1088  // Pointer to the allocated indirect lock is written to crit, while indexing
1089  // is ignored.
1090  void *idx;
1091  kmp_indirect_lock_t **lck;
1092  lck = (kmp_indirect_lock_t **)crit;
1093  kmp_indirect_lock_t *ilk = __kmp_allocate_indirect_lock(&idx, gtid, tag);
1094  KMP_I_LOCK_FUNC(ilk, init)(ilk->lock);
1095  KMP_SET_I_LOCK_LOCATION(ilk, loc);
1096  KMP_SET_I_LOCK_FLAGS(ilk, kmp_lf_critical_section);
1097  KA_TRACE(20,
1098           ("__kmp_init_indirect_csptr: initialized indirect lock #%d\n", tag));
1099#if USE_ITT_BUILD
1100  __kmp_itt_critical_creating(ilk->lock, loc);
1101#endif
1102  int status = KMP_COMPARE_AND_STORE_PTR(lck, nullptr, ilk);
1103  if (status == 0) {
1104#if USE_ITT_BUILD
1105    __kmp_itt_critical_destroyed(ilk->lock);
1106#endif
1107    // We don't really need to destroy the unclaimed lock here since it will be
1108    // cleaned up at program exit.
1109    // KMP_D_LOCK_FUNC(&idx, destroy)((kmp_dyna_lock_t *)&idx);
1110  }
1111  KMP_DEBUG_ASSERT(*lck != NULL);
1112}
1113
1114// Fast-path acquire tas lock
1115#define KMP_ACQUIRE_TAS_LOCK(lock, gtid)                                       \
1116  {                                                                            \
1117    kmp_tas_lock_t *l = (kmp_tas_lock_t *)lock;                                \
1118    kmp_int32 tas_free = KMP_LOCK_FREE(tas);                                   \
1119    kmp_int32 tas_busy = KMP_LOCK_BUSY(gtid + 1, tas);                         \
1120    if (KMP_ATOMIC_LD_RLX(&l->lk.poll) != tas_free ||                          \
1121        !__kmp_atomic_compare_store_acq(&l->lk.poll, tas_free, tas_busy)) {    \
1122      kmp_uint32 spins;                                                        \
1123      KMP_FSYNC_PREPARE(l);                                                    \
1124      KMP_INIT_YIELD(spins);                                                   \
1125      kmp_backoff_t backoff = __kmp_spin_backoff_params;                       \
1126      do {                                                                     \
1127        if (TCR_4(__kmp_nth) >                                                 \
1128            (__kmp_avail_proc ? __kmp_avail_proc : __kmp_xproc)) {             \
1129          KMP_YIELD(TRUE);                                                     \
1130        } else {                                                               \
1131          KMP_YIELD_SPIN(spins);                                               \
1132        }                                                                      \
1133        __kmp_spin_backoff(&backoff);                                          \
1134      } while (                                                                \
1135          KMP_ATOMIC_LD_RLX(&l->lk.poll) != tas_free ||                        \
1136          !__kmp_atomic_compare_store_acq(&l->lk.poll, tas_free, tas_busy));   \
1137    }                                                                          \
1138    KMP_FSYNC_ACQUIRED(l);                                                     \
1139  }
1140
1141// Fast-path test tas lock
1142#define KMP_TEST_TAS_LOCK(lock, gtid, rc)                                      \
1143  {                                                                            \
1144    kmp_tas_lock_t *l = (kmp_tas_lock_t *)lock;                                \
1145    kmp_int32 tas_free = KMP_LOCK_FREE(tas);                                   \
1146    kmp_int32 tas_busy = KMP_LOCK_BUSY(gtid + 1, tas);                         \
1147    rc = KMP_ATOMIC_LD_RLX(&l->lk.poll) == tas_free &&                         \
1148         __kmp_atomic_compare_store_acq(&l->lk.poll, tas_free, tas_busy);      \
1149  }
1150
1151// Fast-path release tas lock
1152#define KMP_RELEASE_TAS_LOCK(lock, gtid)                                       \
1153  { KMP_ATOMIC_ST_REL(&((kmp_tas_lock_t *)lock)->lk.poll, KMP_LOCK_FREE(tas)); }
1154
1155#if KMP_USE_FUTEX
1156
1157#include <sys/syscall.h>
1158#include <unistd.h>
1159#ifndef FUTEX_WAIT
1160#define FUTEX_WAIT 0
1161#endif
1162#ifndef FUTEX_WAKE
1163#define FUTEX_WAKE 1
1164#endif
1165
1166// Fast-path acquire futex lock
1167#define KMP_ACQUIRE_FUTEX_LOCK(lock, gtid)                                     \
1168  {                                                                            \
1169    kmp_futex_lock_t *ftx = (kmp_futex_lock_t *)lock;                          \
1170    kmp_int32 gtid_code = (gtid + 1) << 1;                                     \
1171    KMP_MB();                                                                  \
1172    KMP_FSYNC_PREPARE(ftx);                                                    \
1173    kmp_int32 poll_val;                                                        \
1174    while ((poll_val = KMP_COMPARE_AND_STORE_RET32(                            \
1175                &(ftx->lk.poll), KMP_LOCK_FREE(futex),                         \
1176                KMP_LOCK_BUSY(gtid_code, futex))) != KMP_LOCK_FREE(futex)) {   \
1177      kmp_int32 cond = KMP_LOCK_STRIP(poll_val) & 1;                           \
1178      if (!cond) {                                                             \
1179        if (!KMP_COMPARE_AND_STORE_RET32(&(ftx->lk.poll), poll_val,            \
1180                                         poll_val |                            \
1181                                             KMP_LOCK_BUSY(1, futex))) {       \
1182          continue;                                                            \
1183        }                                                                      \
1184        poll_val |= KMP_LOCK_BUSY(1, futex);                                   \
1185      }                                                                        \
1186      kmp_int32 rc;                                                            \
1187      if ((rc = syscall(__NR_futex, &(ftx->lk.poll), FUTEX_WAIT, poll_val,     \
1188                        NULL, NULL, 0)) != 0) {                                \
1189        continue;                                                              \
1190      }                                                                        \
1191      gtid_code |= 1;                                                          \
1192    }                                                                          \
1193    KMP_FSYNC_ACQUIRED(ftx);                                                   \
1194  }
1195
1196// Fast-path test futex lock
1197#define KMP_TEST_FUTEX_LOCK(lock, gtid, rc)                                    \
1198  {                                                                            \
1199    kmp_futex_lock_t *ftx = (kmp_futex_lock_t *)lock;                          \
1200    if (KMP_COMPARE_AND_STORE_ACQ32(&(ftx->lk.poll), KMP_LOCK_FREE(futex),     \
1201                                    KMP_LOCK_BUSY(gtid + 1 << 1, futex))) {    \
1202      KMP_FSYNC_ACQUIRED(ftx);                                                 \
1203      rc = TRUE;                                                               \
1204    } else {                                                                   \
1205      rc = FALSE;                                                              \
1206    }                                                                          \
1207  }
1208
1209// Fast-path release futex lock
1210#define KMP_RELEASE_FUTEX_LOCK(lock, gtid)                                     \
1211  {                                                                            \
1212    kmp_futex_lock_t *ftx = (kmp_futex_lock_t *)lock;                          \
1213    KMP_MB();                                                                  \
1214    KMP_FSYNC_RELEASING(ftx);                                                  \
1215    kmp_int32 poll_val =                                                       \
1216        KMP_XCHG_FIXED32(&(ftx->lk.poll), KMP_LOCK_FREE(futex));               \
1217    if (KMP_LOCK_STRIP(poll_val) & 1) {                                        \
1218      syscall(__NR_futex, &(ftx->lk.poll), FUTEX_WAKE,                         \
1219              KMP_LOCK_BUSY(1, futex), NULL, NULL, 0);                         \
1220    }                                                                          \
1221    KMP_MB();                                                                  \
1222    KMP_YIELD_OVERSUB();                                                       \
1223  }
1224
1225#endif // KMP_USE_FUTEX
1226
1227#else // KMP_USE_DYNAMIC_LOCK
1228
1229static kmp_user_lock_p __kmp_get_critical_section_ptr(kmp_critical_name *crit,
1230                                                      ident_t const *loc,
1231                                                      kmp_int32 gtid) {
1232  kmp_user_lock_p *lck_pp = (kmp_user_lock_p *)crit;
1233
1234  // Because of the double-check, the following load doesn't need to be volatile
1235  kmp_user_lock_p lck = (kmp_user_lock_p)TCR_PTR(*lck_pp);
1236
1237  if (lck == NULL) {
1238    void *idx;
1239
1240    // Allocate & initialize the lock.
1241    // Remember alloc'ed locks in table in order to free them in __kmp_cleanup()
1242    lck = __kmp_user_lock_allocate(&idx, gtid, kmp_lf_critical_section);
1243    __kmp_init_user_lock_with_checks(lck);
1244    __kmp_set_user_lock_location(lck, loc);
1245#if USE_ITT_BUILD
1246    __kmp_itt_critical_creating(lck);
// __kmp_itt_critical_creating() should be called *before* the first usage
// of the underlying lock. It is the only place where we can guarantee it. There
// is a chance the lock will be destroyed with no usage, but that is not a
// problem, because this is not a real event seen by the user but rather a way
// of setting the name for the object (lock). See more details in kmp_itt.h.
1252#endif /* USE_ITT_BUILD */
1253
1254    // Use a cmpxchg instruction to slam the start of the critical section with
1255    // the lock pointer.  If another thread beat us to it, deallocate the lock,
1256    // and use the lock that the other thread allocated.
1257    int status = KMP_COMPARE_AND_STORE_PTR(lck_pp, 0, lck);
1258
1259    if (status == 0) {
1260// Deallocate the lock and reload the value.
1261#if USE_ITT_BUILD
1262      __kmp_itt_critical_destroyed(lck);
1263// Let ITT know the lock is destroyed and the same memory location may be reused
1264// for another purpose.
1265#endif /* USE_ITT_BUILD */
1266      __kmp_destroy_user_lock_with_checks(lck);
1267      __kmp_user_lock_free(&idx, gtid, lck);
1268      lck = (kmp_user_lock_p)TCR_PTR(*lck_pp);
1269      KMP_DEBUG_ASSERT(lck != NULL);
1270    }
1271  }
1272  return lck;
1273}
1274
1275#endif // KMP_USE_DYNAMIC_LOCK
1276
1277/*!
1278@ingroup WORK_SHARING
1279@param loc  source location information.
1280@param global_tid  global thread number.
1281@param crit identity of the critical section. This could be a pointer to a lock
1282associated with the critical section, or some other suitably unique value.
1283
1284Enter code protected by a `critical` construct.
1285This function blocks until the executing thread can enter the critical section.
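
A named construct such as `#pragma omp critical(name)` is typically lowered
against a zero-initialized global of type kmp_critical_name; the variable name
below is a placeholder:
@code
static kmp_critical_name crit_for_name = {0};
__kmpc_critical(&loc, gtid, &crit_for_name);
// ... body of the critical region ...
__kmpc_end_critical(&loc, gtid, &crit_for_name);
@endcode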
1286*/
1287void __kmpc_critical(ident_t *loc, kmp_int32 global_tid,
1288                     kmp_critical_name *crit) {
1289#if KMP_USE_DYNAMIC_LOCK
1290#if OMPT_SUPPORT && OMPT_OPTIONAL
1291  OMPT_STORE_RETURN_ADDRESS(global_tid);
1292#endif // OMPT_SUPPORT
1293  __kmpc_critical_with_hint(loc, global_tid, crit, omp_lock_hint_none);
1294#else
1295  KMP_COUNT_BLOCK(OMP_CRITICAL);
1296#if OMPT_SUPPORT && OMPT_OPTIONAL
1297  ompt_state_t prev_state = ompt_state_undefined;
1298  ompt_thread_info_t ti;
1299#endif
1300  kmp_user_lock_p lck;
1301
1302  KC_TRACE(10, ("__kmpc_critical: called T#%d\n", global_tid));
1303  __kmp_assert_valid_gtid(global_tid);
1304
1305  // TODO: add THR_OVHD_STATE
1306
1307  KMP_PUSH_PARTITIONED_TIMER(OMP_critical_wait);
1308  KMP_CHECK_USER_LOCK_INIT();
1309
1310  if ((__kmp_user_lock_kind == lk_tas) &&
1311      (sizeof(lck->tas.lk.poll) <= OMP_CRITICAL_SIZE)) {
1312    lck = (kmp_user_lock_p)crit;
1313  }
1314#if KMP_USE_FUTEX
1315  else if ((__kmp_user_lock_kind == lk_futex) &&
1316           (sizeof(lck->futex.lk.poll) <= OMP_CRITICAL_SIZE)) {
1317    lck = (kmp_user_lock_p)crit;
1318  }
1319#endif
1320  else { // ticket, queuing or drdpa
1321    lck = __kmp_get_critical_section_ptr(crit, loc, global_tid);
1322  }
1323
1324  if (__kmp_env_consistency_check)
1325    __kmp_push_sync(global_tid, ct_critical, loc, lck);
1326
    // Since the critical directive binds to all threads, not just the current
    // team, we have to check this even if we are in a serialized team.
    // Also, even if we are the uber thread, we still have to acquire the lock,
    // as we have to contend with sibling threads.
1331
1332#if USE_ITT_BUILD
1333  __kmp_itt_critical_acquiring(lck);
1334#endif /* USE_ITT_BUILD */
1335#if OMPT_SUPPORT && OMPT_OPTIONAL
  OMPT_STORE_RETURN_ADDRESS(global_tid);
1337  void *codeptr_ra = NULL;
1338  if (ompt_enabled.enabled) {
1339    ti = __kmp_threads[global_tid]->th.ompt_thread_info;
1340    /* OMPT state update */
1341    prev_state = ti.state;
1342    ti.wait_id = (ompt_wait_id_t)(uintptr_t)lck;
1343    ti.state = ompt_state_wait_critical;
1344
1345    /* OMPT event callback */
    codeptr_ra = OMPT_LOAD_RETURN_ADDRESS(global_tid);
1347    if (ompt_enabled.ompt_callback_mutex_acquire) {
1348      ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
1349          ompt_mutex_critical, omp_lock_hint_none, __ompt_get_mutex_impl_type(),
1350          (ompt_wait_id_t)(uintptr_t)lck, codeptr_ra);
1351    }
1352  }
1353#endif
1354  // Value of 'crit' should be good for using as a critical_id of the critical
1355  // section directive.
1356  __kmp_acquire_user_lock_with_checks(lck, global_tid);
1357
1358#if USE_ITT_BUILD
1359  __kmp_itt_critical_acquired(lck);
1360#endif /* USE_ITT_BUILD */
1361#if OMPT_SUPPORT && OMPT_OPTIONAL
1362  if (ompt_enabled.enabled) {
1363    /* OMPT state update */
1364    ti.state = prev_state;
1365    ti.wait_id = 0;
1366
1367    /* OMPT event callback */
1368    if (ompt_enabled.ompt_callback_mutex_acquired) {
1369      ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
1370          ompt_mutex_critical, (ompt_wait_id_t)(uintptr_t)lck, codeptr_ra);
1371    }
1372  }
1373#endif
1374  KMP_POP_PARTITIONED_TIMER();
1375
1376  KMP_PUSH_PARTITIONED_TIMER(OMP_critical);
1377  KA_TRACE(15, ("__kmpc_critical: done T#%d\n", global_tid));
1378#endif // KMP_USE_DYNAMIC_LOCK
1379}
1380
1381#if KMP_USE_DYNAMIC_LOCK
1382
1383// Converts the given hint to an internal lock implementation
1384static __forceinline kmp_dyna_lockseq_t __kmp_map_hint_to_lock(uintptr_t hint) {
1385#if KMP_USE_TSX
1386#define KMP_TSX_LOCK(seq) lockseq_##seq
1387#else
1388#define KMP_TSX_LOCK(seq) __kmp_user_lock_seq
1389#endif
1390
1391#if KMP_ARCH_X86 || KMP_ARCH_X86_64
1392#define KMP_CPUINFO_RTM (__kmp_cpuinfo.flags.rtm)
1393#else
1394#define KMP_CPUINFO_RTM 0
1395#endif
1396
1397  // Hints that do not require further logic
1398  if (hint & kmp_lock_hint_hle)
1399    return KMP_TSX_LOCK(hle);
1400  if (hint & kmp_lock_hint_rtm)
1401    return KMP_CPUINFO_RTM ? KMP_TSX_LOCK(rtm_queuing) : __kmp_user_lock_seq;
1402  if (hint & kmp_lock_hint_adaptive)
1403    return KMP_CPUINFO_RTM ? KMP_TSX_LOCK(adaptive) : __kmp_user_lock_seq;
1404
1405  // Rule out conflicting hints first by returning the default lock
1406  if ((hint & omp_lock_hint_contended) && (hint & omp_lock_hint_uncontended))
1407    return __kmp_user_lock_seq;
1408  if ((hint & omp_lock_hint_speculative) &&
1409      (hint & omp_lock_hint_nonspeculative))
1410    return __kmp_user_lock_seq;
1411
1412  // Do not even consider speculation when it appears to be contended
1413  if (hint & omp_lock_hint_contended)
1414    return lockseq_queuing;
1415
1416  // Uncontended lock without speculation
1417  if ((hint & omp_lock_hint_uncontended) && !(hint & omp_lock_hint_speculative))
1418    return lockseq_tas;
1419
1420  // Use RTM lock for speculation
1421  if (hint & omp_lock_hint_speculative)
1422    return KMP_CPUINFO_RTM ? KMP_TSX_LOCK(rtm_spin) : __kmp_user_lock_seq;
1423
1424  return __kmp_user_lock_seq;
1425}
1426
1427#if OMPT_SUPPORT && OMPT_OPTIONAL
1428#if KMP_USE_DYNAMIC_LOCK
1429static kmp_mutex_impl_t
1430__ompt_get_mutex_impl_type(void *user_lock, kmp_indirect_lock_t *ilock = 0) {
1431  if (user_lock) {
1432    switch (KMP_EXTRACT_D_TAG(user_lock)) {
1433    case 0:
1434      break;
1435#if KMP_USE_FUTEX
1436    case locktag_futex:
1437      return kmp_mutex_impl_queuing;
1438#endif
1439    case locktag_tas:
1440      return kmp_mutex_impl_spin;
1441#if KMP_USE_TSX
1442    case locktag_hle:
1443    case locktag_rtm_spin:
1444      return kmp_mutex_impl_speculative;
1445#endif
1446    default:
1447      return kmp_mutex_impl_none;
1448    }
1449    ilock = KMP_LOOKUP_I_LOCK(user_lock);
1450  }
1451  KMP_ASSERT(ilock);
1452  switch (ilock->type) {
1453#if KMP_USE_TSX
1454  case locktag_adaptive:
1455  case locktag_rtm_queuing:
1456    return kmp_mutex_impl_speculative;
1457#endif
1458  case locktag_nested_tas:
1459    return kmp_mutex_impl_spin;
1460#if KMP_USE_FUTEX
1461  case locktag_nested_futex:
1462#endif
1463  case locktag_ticket:
1464  case locktag_queuing:
1465  case locktag_drdpa:
1466  case locktag_nested_ticket:
1467  case locktag_nested_queuing:
1468  case locktag_nested_drdpa:
1469    return kmp_mutex_impl_queuing;
1470  default:
1471    return kmp_mutex_impl_none;
1472  }
1473}
1474#else
1475// For locks without dynamic binding
1476static kmp_mutex_impl_t __ompt_get_mutex_impl_type() {
1477  switch (__kmp_user_lock_kind) {
1478  case lk_tas:
1479    return kmp_mutex_impl_spin;
1480#if KMP_USE_FUTEX
1481  case lk_futex:
1482#endif
1483  case lk_ticket:
1484  case lk_queuing:
1485  case lk_drdpa:
1486    return kmp_mutex_impl_queuing;
1487#if KMP_USE_TSX
1488  case lk_hle:
1489  case lk_rtm_queuing:
1490  case lk_rtm_spin:
1491  case lk_adaptive:
1492    return kmp_mutex_impl_speculative;
1493#endif
1494  default:
1495    return kmp_mutex_impl_none;
1496  }
1497}
1498#endif // KMP_USE_DYNAMIC_LOCK
1499#endif // OMPT_SUPPORT && OMPT_OPTIONAL
1500
1501/*!
1502@ingroup WORK_SHARING
1503@param loc  source location information.
1504@param global_tid  global thread number.
1505@param crit identity of the critical section. This could be a pointer to a lock
1506associated with the critical section, or some other suitably unique value.
1507@param hint the lock hint.
1508
1509Enter code protected by a `critical` construct with a hint. The hint value is
1510used to suggest a lock implementation. This function blocks until the executing
1511thread can enter the critical section unless the hint suggests use of
1512speculative execution and the hardware supports it.
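
As a rough illustration only (the exact lowering is compiler dependent, and the
variable names here are hypothetical), a <tt>critical</tt> construct with a
speculative hint might be lowered along these lines:
@code
static kmp_critical_name crit_name;   // zero-initialized name/lock storage
// ... inside the parallel region, with loc and gtid provided by the compiler:
__kmpc_critical_with_hint(&loc, gtid, &crit_name, omp_lock_hint_speculative);
// body of the critical section
__kmpc_end_critical(&loc, gtid, &crit_name);
@endcode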
1513*/
1514void __kmpc_critical_with_hint(ident_t *loc, kmp_int32 global_tid,
1515                               kmp_critical_name *crit, uint32_t hint) {
1516  KMP_COUNT_BLOCK(OMP_CRITICAL);
1517  kmp_user_lock_p lck;
1518#if OMPT_SUPPORT && OMPT_OPTIONAL
1519  ompt_state_t prev_state = ompt_state_undefined;
1520  ompt_thread_info_t ti;
1521  // This is the case, if called from __kmpc_critical:
1522  void *codeptr = OMPT_LOAD_RETURN_ADDRESS(global_tid);
1523  if (!codeptr)
1524    codeptr = OMPT_GET_RETURN_ADDRESS(0);
1525#endif
1526
1527  KC_TRACE(10, ("__kmpc_critical: called T#%d\n", global_tid));
1528  __kmp_assert_valid_gtid(global_tid);
1529
1530  kmp_dyna_lock_t *lk = (kmp_dyna_lock_t *)crit;
1531  // Check if it is initialized.
1532  KMP_PUSH_PARTITIONED_TIMER(OMP_critical_wait);
1533  kmp_dyna_lockseq_t lockseq = __kmp_map_hint_to_lock(hint);
1534  if (*lk == 0) {
1535    if (KMP_IS_D_LOCK(lockseq)) {
1536      KMP_COMPARE_AND_STORE_ACQ32(
1537          (volatile kmp_int32 *)&((kmp_base_tas_lock_t *)crit)->poll, 0,
1538          KMP_GET_D_TAG(lockseq));
1539    } else {
1540      __kmp_init_indirect_csptr(crit, loc, global_tid, KMP_GET_I_TAG(lockseq));
1541    }
1542  }
1543  // Branch for accessing the actual lock object and set operation. This
1544  // branching is inevitable since this lock initialization does not follow the
1545  // normal dispatch path (lock table is not used).
1546  if (KMP_EXTRACT_D_TAG(lk) != 0) {
1547    lck = (kmp_user_lock_p)lk;
1548    if (__kmp_env_consistency_check) {
1549      __kmp_push_sync(global_tid, ct_critical, loc, lck,
1550                      __kmp_map_hint_to_lock(hint));
1551    }
1552#if USE_ITT_BUILD
1553    __kmp_itt_critical_acquiring(lck);
1554#endif
1555#if OMPT_SUPPORT && OMPT_OPTIONAL
1556    if (ompt_enabled.enabled) {
1557      ti = __kmp_threads[global_tid]->th.ompt_thread_info;
1558      /* OMPT state update */
1559      prev_state = ti.state;
1560      ti.wait_id = (ompt_wait_id_t)(uintptr_t)lck;
1561      ti.state = ompt_state_wait_critical;
1562
1563      /* OMPT event callback */
1564      if (ompt_enabled.ompt_callback_mutex_acquire) {
1565        ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
1566            ompt_mutex_critical, (unsigned int)hint,
1567            __ompt_get_mutex_impl_type(crit), (ompt_wait_id_t)(uintptr_t)lck,
1568            codeptr);
1569      }
1570    }
1571#endif
1572#if KMP_USE_INLINED_TAS
1573    if (lockseq == lockseq_tas && !__kmp_env_consistency_check) {
1574      KMP_ACQUIRE_TAS_LOCK(lck, global_tid);
1575    } else
1576#elif KMP_USE_INLINED_FUTEX
1577    if (lockseq == lockseq_futex && !__kmp_env_consistency_check) {
1578      KMP_ACQUIRE_FUTEX_LOCK(lck, global_tid);
1579    } else
1580#endif
1581    {
1582      KMP_D_LOCK_FUNC(lk, set)(lk, global_tid);
1583    }
1584  } else {
1585    kmp_indirect_lock_t *ilk = *((kmp_indirect_lock_t **)lk);
1586    lck = ilk->lock;
1587    if (__kmp_env_consistency_check) {
1588      __kmp_push_sync(global_tid, ct_critical, loc, lck,
1589                      __kmp_map_hint_to_lock(hint));
1590    }
1591#if USE_ITT_BUILD
1592    __kmp_itt_critical_acquiring(lck);
1593#endif
1594#if OMPT_SUPPORT && OMPT_OPTIONAL
1595    if (ompt_enabled.enabled) {
1596      ti = __kmp_threads[global_tid]->th.ompt_thread_info;
1597      /* OMPT state update */
1598      prev_state = ti.state;
1599      ti.wait_id = (ompt_wait_id_t)(uintptr_t)lck;
1600      ti.state = ompt_state_wait_critical;
1601
1602      /* OMPT event callback */
1603      if (ompt_enabled.ompt_callback_mutex_acquire) {
1604        ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
1605            ompt_mutex_critical, (unsigned int)hint,
1606            __ompt_get_mutex_impl_type(0, ilk), (ompt_wait_id_t)(uintptr_t)lck,
1607            codeptr);
1608      }
1609    }
1610#endif
1611    KMP_I_LOCK_FUNC(ilk, set)(lck, global_tid);
1612  }
1613  KMP_POP_PARTITIONED_TIMER();
1614
1615#if USE_ITT_BUILD
1616  __kmp_itt_critical_acquired(lck);
1617#endif /* USE_ITT_BUILD */
1618#if OMPT_SUPPORT && OMPT_OPTIONAL
1619  if (ompt_enabled.enabled) {
1620    /* OMPT state update */
1621    ti.state = prev_state;
1622    ti.wait_id = 0;
1623
1624    /* OMPT event callback */
1625    if (ompt_enabled.ompt_callback_mutex_acquired) {
1626      ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
1627          ompt_mutex_critical, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
1628    }
1629  }
1630#endif
1631
1632  KMP_PUSH_PARTITIONED_TIMER(OMP_critical);
1633  KA_TRACE(15, ("__kmpc_critical: done T#%d\n", global_tid));
1634} // __kmpc_critical_with_hint
1635
1636#endif // KMP_USE_DYNAMIC_LOCK
1637
1638/*!
1639@ingroup WORK_SHARING
1640@param loc  source location information.
@param global_tid  global thread number.
1642@param crit identity of the critical section. This could be a pointer to a lock
1643associated with the critical section, or some other suitably unique value.
1644
1645Leave a critical section, releasing any lock that was held during its execution.
1646*/
1647void __kmpc_end_critical(ident_t *loc, kmp_int32 global_tid,
1648                         kmp_critical_name *crit) {
1649  kmp_user_lock_p lck;
1650
1651  KC_TRACE(10, ("__kmpc_end_critical: called T#%d\n", global_tid));
1652
1653#if KMP_USE_DYNAMIC_LOCK
1654  int locktag = KMP_EXTRACT_D_TAG(crit);
1655  if (locktag) {
1656    lck = (kmp_user_lock_p)crit;
1657    KMP_ASSERT(lck != NULL);
1658    if (__kmp_env_consistency_check) {
1659      __kmp_pop_sync(global_tid, ct_critical, loc);
1660    }
1661#if USE_ITT_BUILD
1662    __kmp_itt_critical_releasing(lck);
1663#endif
1664#if KMP_USE_INLINED_TAS
1665    if (locktag == locktag_tas && !__kmp_env_consistency_check) {
1666      KMP_RELEASE_TAS_LOCK(lck, global_tid);
1667    } else
1668#elif KMP_USE_INLINED_FUTEX
1669    if (locktag == locktag_futex && !__kmp_env_consistency_check) {
1670      KMP_RELEASE_FUTEX_LOCK(lck, global_tid);
1671    } else
1672#endif
1673    {
1674      KMP_D_LOCK_FUNC(lck, unset)((kmp_dyna_lock_t *)lck, global_tid);
1675    }
1676  } else {
1677    kmp_indirect_lock_t *ilk =
1678        (kmp_indirect_lock_t *)TCR_PTR(*((kmp_indirect_lock_t **)crit));
1679    KMP_ASSERT(ilk != NULL);
1680    lck = ilk->lock;
1681    if (__kmp_env_consistency_check) {
1682      __kmp_pop_sync(global_tid, ct_critical, loc);
1683    }
1684#if USE_ITT_BUILD
1685    __kmp_itt_critical_releasing(lck);
1686#endif
1687    KMP_I_LOCK_FUNC(ilk, unset)(lck, global_tid);
1688  }
1689
1690#else // KMP_USE_DYNAMIC_LOCK
1691
1692  if ((__kmp_user_lock_kind == lk_tas) &&
1693      (sizeof(lck->tas.lk.poll) <= OMP_CRITICAL_SIZE)) {
1694    lck = (kmp_user_lock_p)crit;
1695  }
1696#if KMP_USE_FUTEX
1697  else if ((__kmp_user_lock_kind == lk_futex) &&
1698           (sizeof(lck->futex.lk.poll) <= OMP_CRITICAL_SIZE)) {
1699    lck = (kmp_user_lock_p)crit;
1700  }
1701#endif
1702  else { // ticket, queuing or drdpa
1703    lck = (kmp_user_lock_p)TCR_PTR(*((kmp_user_lock_p *)crit));
1704  }
1705
1706  KMP_ASSERT(lck != NULL);
1707
1708  if (__kmp_env_consistency_check)
1709    __kmp_pop_sync(global_tid, ct_critical, loc);
1710
1711#if USE_ITT_BUILD
1712  __kmp_itt_critical_releasing(lck);
1713#endif /* USE_ITT_BUILD */
1714  // Value of 'crit' should be good for using as a critical_id of the critical
1715  // section directive.
1716  __kmp_release_user_lock_with_checks(lck, global_tid);
1717
1718#endif // KMP_USE_DYNAMIC_LOCK
1719
1720#if OMPT_SUPPORT && OMPT_OPTIONAL
1721  /* OMPT release event triggers after lock is released; place here to trigger
1722   * for all #if branches */
1723  OMPT_STORE_RETURN_ADDRESS(global_tid);
1724  if (ompt_enabled.ompt_callback_mutex_released) {
1725    ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
1726        ompt_mutex_critical, (ompt_wait_id_t)(uintptr_t)lck,
1727        OMPT_LOAD_RETURN_ADDRESS(0));
1728  }
1729#endif
1730
1731  KMP_POP_PARTITIONED_TIMER();
1732  KA_TRACE(15, ("__kmpc_end_critical: done T#%d\n", global_tid));
1733}
1734
1735/*!
1736@ingroup SYNCHRONIZATION
1737@param loc source location information
1738@param global_tid thread id.
1739@return one if the thread should execute the master block, zero otherwise
1740
1741Start execution of a combined barrier and master. The barrier is executed inside
1742this function.
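
For illustration (assuming <tt>loc</tt> and <tt>gtid</tt> are already
available), the expected usage pairs this call with
@ref __kmpc_end_barrier_master:
@code
if (__kmpc_barrier_master(&loc, gtid)) {
  // executed by the primary thread only
  __kmpc_end_barrier_master(&loc, gtid);
}
@endcode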
1743*/
1744kmp_int32 __kmpc_barrier_master(ident_t *loc, kmp_int32 global_tid) {
1745  int status;
1746  KC_TRACE(10, ("__kmpc_barrier_master: called T#%d\n", global_tid));
1747  __kmp_assert_valid_gtid(global_tid);
1748
1749  if (!TCR_4(__kmp_init_parallel))
1750    __kmp_parallel_initialize();
1751
1752  __kmp_resume_if_soft_paused();
1753
1754  if (__kmp_env_consistency_check)
1755    __kmp_check_barrier(global_tid, ct_barrier, loc);
1756
1757#if OMPT_SUPPORT
1758  ompt_frame_t *ompt_frame;
1759  if (ompt_enabled.enabled) {
1760    __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
1761    if (ompt_frame->enter_frame.ptr == NULL)
1762      ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
1763  }
1764  OMPT_STORE_RETURN_ADDRESS(global_tid);
1765#endif
1766#if USE_ITT_NOTIFY
1767  __kmp_threads[global_tid]->th.th_ident = loc;
1768#endif
1769  status = __kmp_barrier(bs_plain_barrier, global_tid, TRUE, 0, NULL, NULL);
1770#if OMPT_SUPPORT && OMPT_OPTIONAL
1771  if (ompt_enabled.enabled) {
1772    ompt_frame->enter_frame = ompt_data_none;
1773  }
1774#endif
1775
1776  return (status != 0) ? 0 : 1;
1777}
1778
1779/*!
1780@ingroup SYNCHRONIZATION
1781@param loc source location information
1782@param global_tid thread id.
1783
1784Complete the execution of a combined barrier and master. This function should
1785only be called at the completion of the <tt>master</tt> code. Other threads will
1786still be waiting at the barrier and this call releases them.
1787*/
1788void __kmpc_end_barrier_master(ident_t *loc, kmp_int32 global_tid) {
1789  KC_TRACE(10, ("__kmpc_end_barrier_master: called T#%d\n", global_tid));
1790  __kmp_assert_valid_gtid(global_tid);
1791  __kmp_end_split_barrier(bs_plain_barrier, global_tid);
1792}
1793
1794/*!
1795@ingroup SYNCHRONIZATION
1796@param loc source location information
1797@param global_tid thread id.
1798@return one if the thread should execute the master block, zero otherwise
1799
1800Start execution of a combined barrier and master(nowait) construct.
1801The barrier is executed inside this function.
There is no equivalent "end" function, since the bookkeeping that an "end"
call would normally perform (such as popping the master construct from the
consistency-checking stack) is done inside this function.
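
For illustration (assuming <tt>loc</tt> and <tt>gtid</tt> are already
available), the expected usage is simply:
@code
if (__kmpc_barrier_master_nowait(&loc, gtid)) {
  // executed by the primary thread only; no "end" call is required
}
@endcode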
1803*/
1804kmp_int32 __kmpc_barrier_master_nowait(ident_t *loc, kmp_int32 global_tid) {
1805  kmp_int32 ret;
1806  KC_TRACE(10, ("__kmpc_barrier_master_nowait: called T#%d\n", global_tid));
1807  __kmp_assert_valid_gtid(global_tid);
1808
1809  if (!TCR_4(__kmp_init_parallel))
1810    __kmp_parallel_initialize();
1811
1812  __kmp_resume_if_soft_paused();
1813
1814  if (__kmp_env_consistency_check) {
1815    if (loc == 0) {
1816      KMP_WARNING(ConstructIdentInvalid); // ??? What does it mean for the user?
1817    }
1818    __kmp_check_barrier(global_tid, ct_barrier, loc);
1819  }
1820
1821#if OMPT_SUPPORT
1822  ompt_frame_t *ompt_frame;
1823  if (ompt_enabled.enabled) {
1824    __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
1825    if (ompt_frame->enter_frame.ptr == NULL)
1826      ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
1827  }
1828  OMPT_STORE_RETURN_ADDRESS(global_tid);
1829#endif
1830#if USE_ITT_NOTIFY
1831  __kmp_threads[global_tid]->th.th_ident = loc;
1832#endif
1833  __kmp_barrier(bs_plain_barrier, global_tid, FALSE, 0, NULL, NULL);
1834#if OMPT_SUPPORT && OMPT_OPTIONAL
1835  if (ompt_enabled.enabled) {
1836    ompt_frame->enter_frame = ompt_data_none;
1837  }
1838#endif
1839
1840  ret = __kmpc_master(loc, global_tid);
1841
1842  if (__kmp_env_consistency_check) {
1843    /*  there's no __kmpc_end_master called; so the (stats) */
1844    /*  actions of __kmpc_end_master are done here          */
1845    if (ret) {
1846      /* only one thread should do the pop since only */
1847      /* one did the push (see __kmpc_master())       */
1848      __kmp_pop_sync(global_tid, ct_master, loc);
1849    }
1850  }
1851
1852  return (ret);
1853}
1854
1855/* The BARRIER for a SINGLE process section is always explicit   */
1856/*!
1857@ingroup WORK_SHARING
1858@param loc  source location information
1859@param global_tid  global thread number
1860@return One if this thread should execute the single construct, zero otherwise.
1861
1862Test whether to execute a <tt>single</tt> construct.
There are no implicit barriers in the two "single" calls; rather, the compiler
should introduce an explicit barrier if one is required.
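
A sketch of the expected usage (assuming <tt>loc</tt> and <tt>gtid</tt> are
available); the trailing barrier is only emitted when the construct does not
have a <tt>nowait</tt> clause:
@code
if (__kmpc_single(&loc, gtid)) {
  // body of the single construct
  __kmpc_end_single(&loc, gtid);
}
__kmpc_barrier(&loc, gtid); // omitted for single nowait
@endcode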
1865*/
1866
1867kmp_int32 __kmpc_single(ident_t *loc, kmp_int32 global_tid) {
1868  __kmp_assert_valid_gtid(global_tid);
1869  kmp_int32 rc = __kmp_enter_single(global_tid, loc, TRUE);
1870
1871  if (rc) {
1872    // We are going to execute the single statement, so we should count it.
1873    KMP_COUNT_BLOCK(OMP_SINGLE);
1874    KMP_PUSH_PARTITIONED_TIMER(OMP_single);
1875  }
1876
1877#if OMPT_SUPPORT && OMPT_OPTIONAL
1878  kmp_info_t *this_thr = __kmp_threads[global_tid];
1879  kmp_team_t *team = this_thr->th.th_team;
1880  int tid = __kmp_tid_from_gtid(global_tid);
1881
1882  if (ompt_enabled.enabled) {
1883    if (rc) {
1884      if (ompt_enabled.ompt_callback_work) {
1885        ompt_callbacks.ompt_callback(ompt_callback_work)(
1886            ompt_work_single_executor, ompt_scope_begin,
1887            &(team->t.ompt_team_info.parallel_data),
1888            &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data),
1889            1, OMPT_GET_RETURN_ADDRESS(0));
1890      }
1891    } else {
1892      if (ompt_enabled.ompt_callback_work) {
1893        ompt_callbacks.ompt_callback(ompt_callback_work)(
1894            ompt_work_single_other, ompt_scope_begin,
1895            &(team->t.ompt_team_info.parallel_data),
1896            &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data),
1897            1, OMPT_GET_RETURN_ADDRESS(0));
1898        ompt_callbacks.ompt_callback(ompt_callback_work)(
1899            ompt_work_single_other, ompt_scope_end,
1900            &(team->t.ompt_team_info.parallel_data),
1901            &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data),
1902            1, OMPT_GET_RETURN_ADDRESS(0));
1903      }
1904    }
1905  }
1906#endif
1907
1908  return rc;
1909}
1910
1911/*!
1912@ingroup WORK_SHARING
1913@param loc  source location information
1914@param global_tid  global thread number
1915
1916Mark the end of a <tt>single</tt> construct.  This function should
1917only be called by the thread that executed the block of code protected
1918by the `single` construct.
1919*/
1920void __kmpc_end_single(ident_t *loc, kmp_int32 global_tid) {
1921  __kmp_assert_valid_gtid(global_tid);
1922  __kmp_exit_single(global_tid);
1923  KMP_POP_PARTITIONED_TIMER();
1924
1925#if OMPT_SUPPORT && OMPT_OPTIONAL
1926  kmp_info_t *this_thr = __kmp_threads[global_tid];
1927  kmp_team_t *team = this_thr->th.th_team;
1928  int tid = __kmp_tid_from_gtid(global_tid);
1929
1930  if (ompt_enabled.ompt_callback_work) {
1931    ompt_callbacks.ompt_callback(ompt_callback_work)(
1932        ompt_work_single_executor, ompt_scope_end,
1933        &(team->t.ompt_team_info.parallel_data),
1934        &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data), 1,
1935        OMPT_GET_RETURN_ADDRESS(0));
1936  }
1937#endif
1938}
1939
1940/*!
1941@ingroup WORK_SHARING
1942@param loc Source location
1943@param global_tid Global thread id
1944
1945Mark the end of a statically scheduled loop.
1946*/
1947void __kmpc_for_static_fini(ident_t *loc, kmp_int32 global_tid) {
1948  KMP_POP_PARTITIONED_TIMER();
1949  KE_TRACE(10, ("__kmpc_for_static_fini called T#%d\n", global_tid));
1950
1951#if OMPT_SUPPORT && OMPT_OPTIONAL
1952  if (ompt_enabled.ompt_callback_work) {
1953    ompt_work_t ompt_work_type = ompt_work_loop;
1954    ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL);
1955    ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
1956    // Determine workshare type
1957    if (loc != NULL) {
1958      if ((loc->flags & KMP_IDENT_WORK_LOOP) != 0) {
1959        ompt_work_type = ompt_work_loop;
1960      } else if ((loc->flags & KMP_IDENT_WORK_SECTIONS) != 0) {
1961        ompt_work_type = ompt_work_sections;
1962      } else if ((loc->flags & KMP_IDENT_WORK_DISTRIBUTE) != 0) {
1963        ompt_work_type = ompt_work_distribute;
1964      } else {
1965        // use default set above.
1966        // a warning about this case is provided in __kmpc_for_static_init
1967      }
1968      KMP_DEBUG_ASSERT(ompt_work_type);
1969    }
1970    ompt_callbacks.ompt_callback(ompt_callback_work)(
1971        ompt_work_type, ompt_scope_end, &(team_info->parallel_data),
1972        &(task_info->task_data), 0, OMPT_GET_RETURN_ADDRESS(0));
1973  }
1974#endif
1975  if (__kmp_env_consistency_check)
1976    __kmp_pop_workshare(global_tid, ct_pdo, loc);
1977}
1978
1979// User routines which take C-style arguments (call by value)
1980// different from the Fortran equivalent routines
1981
1982void ompc_set_num_threads(int arg) {
1983  // !!!!! TODO: check the per-task binding
1984  __kmp_set_num_threads(arg, __kmp_entry_gtid());
1985}
1986
1987void ompc_set_dynamic(int flag) {
1988  kmp_info_t *thread;
1989
1990  /* For the thread-private implementation of the internal controls */
1991  thread = __kmp_entry_thread();
1992
1993  __kmp_save_internal_controls(thread);
1994
1995  set__dynamic(thread, flag ? true : false);
1996}
1997
1998void ompc_set_nested(int flag) {
1999  kmp_info_t *thread;
2000
2001  /* For the thread-private internal controls implementation */
2002  thread = __kmp_entry_thread();
2003
2004  __kmp_save_internal_controls(thread);
2005
2006  set__max_active_levels(thread, flag ? __kmp_dflt_max_active_levels : 1);
2007}
2008
2009void ompc_set_max_active_levels(int max_active_levels) {
2010  /* TO DO */
2011  /* we want per-task implementation of this internal control */
2012
2013  /* For the per-thread internal controls implementation */
2014  __kmp_set_max_active_levels(__kmp_entry_gtid(), max_active_levels);
2015}
2016
2017void ompc_set_schedule(omp_sched_t kind, int modifier) {
2018  // !!!!! TODO: check the per-task binding
2019  __kmp_set_schedule(__kmp_entry_gtid(), (kmp_sched_t)kind, modifier);
2020}
2021
2022int ompc_get_ancestor_thread_num(int level) {
2023  return __kmp_get_ancestor_thread_num(__kmp_entry_gtid(), level);
2024}
2025
2026int ompc_get_team_size(int level) {
2027  return __kmp_get_team_size(__kmp_entry_gtid(), level);
2028}
2029
2030/* OpenMP 5.0 Affinity Format API */
2031void KMP_EXPAND_NAME(ompc_set_affinity_format)(char const *format) {
2032  if (!__kmp_init_serial) {
2033    __kmp_serial_initialize();
2034  }
2035  __kmp_strncpy_truncate(__kmp_affinity_format, KMP_AFFINITY_FORMAT_SIZE,
2036                         format, KMP_STRLEN(format) + 1);
2037}
2038
2039size_t KMP_EXPAND_NAME(ompc_get_affinity_format)(char *buffer, size_t size) {
2040  size_t format_size;
2041  if (!__kmp_init_serial) {
2042    __kmp_serial_initialize();
2043  }
2044  format_size = KMP_STRLEN(__kmp_affinity_format);
2045  if (buffer && size) {
2046    __kmp_strncpy_truncate(buffer, size, __kmp_affinity_format,
2047                           format_size + 1);
2048  }
2049  return format_size;
2050}
2051
2052void KMP_EXPAND_NAME(ompc_display_affinity)(char const *format) {
2053  int gtid;
2054  if (!TCR_4(__kmp_init_middle)) {
2055    __kmp_middle_initialize();
2056  }
2057  __kmp_assign_root_init_mask();
2058  gtid = __kmp_get_gtid();
2059#if KMP_AFFINITY_SUPPORTED
2060  if (__kmp_threads[gtid]->th.th_team->t.t_level == 0 &&
2061      __kmp_affinity.flags.reset) {
2062    __kmp_reset_root_init_mask(gtid);
2063  }
2064#endif
2065  __kmp_aux_display_affinity(gtid, format);
2066}
2067
2068size_t KMP_EXPAND_NAME(ompc_capture_affinity)(char *buffer, size_t buf_size,
2069                                              char const *format) {
2070  int gtid;
2071  size_t num_required;
2072  kmp_str_buf_t capture_buf;
2073  if (!TCR_4(__kmp_init_middle)) {
2074    __kmp_middle_initialize();
2075  }
2076  __kmp_assign_root_init_mask();
2077  gtid = __kmp_get_gtid();
2078#if KMP_AFFINITY_SUPPORTED
2079  if (__kmp_threads[gtid]->th.th_team->t.t_level == 0 &&
2080      __kmp_affinity.flags.reset) {
2081    __kmp_reset_root_init_mask(gtid);
2082  }
2083#endif
2084  __kmp_str_buf_init(&capture_buf);
2085  num_required = __kmp_aux_capture_affinity(gtid, format, &capture_buf);
2086  if (buffer && buf_size) {
2087    __kmp_strncpy_truncate(buffer, buf_size, capture_buf.str,
2088                           capture_buf.used + 1);
2089  }
2090  __kmp_str_buf_free(&capture_buf);
2091  return num_required;
2092}
2093
2094void kmpc_set_stacksize(int arg) {
2095  // __kmp_aux_set_stacksize initializes the library if needed
2096  __kmp_aux_set_stacksize(arg);
2097}
2098
2099void kmpc_set_stacksize_s(size_t arg) {
2100  // __kmp_aux_set_stacksize initializes the library if needed
2101  __kmp_aux_set_stacksize(arg);
2102}
2103
2104void kmpc_set_blocktime(int arg) {
2105  int gtid, tid, bt = arg;
2106  kmp_info_t *thread;
2107
2108  gtid = __kmp_entry_gtid();
2109  tid = __kmp_tid_from_gtid(gtid);
2110  thread = __kmp_thread_from_gtid(gtid);
2111
2112  __kmp_aux_convert_blocktime(&bt);
2113  __kmp_aux_set_blocktime(bt, thread, tid);
2114}
2115
2116void kmpc_set_library(int arg) {
2117  // __kmp_user_set_library initializes the library if needed
2118  __kmp_user_set_library((enum library_type)arg);
2119}
2120
2121void kmpc_set_defaults(char const *str) {
2122  // __kmp_aux_set_defaults initializes the library if needed
2123  __kmp_aux_set_defaults(str, KMP_STRLEN(str));
2124}
2125
2126void kmpc_set_disp_num_buffers(int arg) {
2127  // ignore after initialization because some teams have already
2128  // allocated dispatch buffers
2129  if (__kmp_init_serial == FALSE && arg >= KMP_MIN_DISP_NUM_BUFF &&
2130      arg <= KMP_MAX_DISP_NUM_BUFF) {
2131    __kmp_dispatch_num_buffers = arg;
2132  }
2133}
2134
2135int kmpc_set_affinity_mask_proc(int proc, void **mask) {
2136#if defined(KMP_STUB) || !KMP_AFFINITY_SUPPORTED
2137  return -1;
2138#else
2139  if (!TCR_4(__kmp_init_middle)) {
2140    __kmp_middle_initialize();
2141  }
2142  __kmp_assign_root_init_mask();
2143  return __kmp_aux_set_affinity_mask_proc(proc, mask);
2144#endif
2145}
2146
2147int kmpc_unset_affinity_mask_proc(int proc, void **mask) {
2148#if defined(KMP_STUB) || !KMP_AFFINITY_SUPPORTED
2149  return -1;
2150#else
2151  if (!TCR_4(__kmp_init_middle)) {
2152    __kmp_middle_initialize();
2153  }
2154  __kmp_assign_root_init_mask();
2155  return __kmp_aux_unset_affinity_mask_proc(proc, mask);
2156#endif
2157}
2158
2159int kmpc_get_affinity_mask_proc(int proc, void **mask) {
2160#if defined(KMP_STUB) || !KMP_AFFINITY_SUPPORTED
2161  return -1;
2162#else
2163  if (!TCR_4(__kmp_init_middle)) {
2164    __kmp_middle_initialize();
2165  }
2166  __kmp_assign_root_init_mask();
2167  return __kmp_aux_get_affinity_mask_proc(proc, mask);
2168#endif
2169}
2170
2171/* -------------------------------------------------------------------------- */
2172/*!
2173@ingroup THREADPRIVATE
2174@param loc       source location information
2175@param gtid      global thread number
2176@param cpy_size  size of the cpy_data buffer
2177@param cpy_data  pointer to data to be copied
2178@param cpy_func  helper function to call for copying data
2179@param didit     flag variable: 1=single thread; 0=not single thread
2180
2181__kmpc_copyprivate implements the interface for the private data broadcast
2182needed for the copyprivate clause associated with a single region in an
2183OpenMP<sup>*</sup> program (both C and Fortran).
2184All threads participating in the parallel region call this routine.
2185One of the threads (called the single thread) should have the <tt>didit</tt>
2186variable set to 1 and all other threads should have that variable set to 0.
2187All threads pass a pointer to a data buffer (cpy_data) that they have built.
2188
2189The OpenMP specification forbids the use of nowait on the single region when a
2190copyprivate clause is present. However, @ref __kmpc_copyprivate implements a
2191barrier internally to avoid race conditions, so the code generation for the
2192single region should avoid generating a barrier after the call to @ref
2193__kmpc_copyprivate.
2194
2195The <tt>gtid</tt> parameter is the global thread id for the current thread.
2196The <tt>loc</tt> parameter is a pointer to source location information.
2197
Internal implementation: The single thread will first copy its descriptor
address (cpy_data) to a team-private location, and then each of the other
threads will call the function pointed to by the parameter cpy_func, which
performs the copy using the cpy_data buffers.
2202
2203The cpy_func routine used for the copy and the contents of the data area defined
2204by cpy_data and cpy_size may be built in any fashion that will allow the copy
2205to be done. For instance, the cpy_data buffer can hold the actual data to be
2206copied or it may hold a list of pointers to the data. The cpy_func routine must
2207interpret the cpy_data buffer appropriately.
2208
2209The interface to cpy_func is as follows:
2210@code
2211void cpy_func( void *destination, void *source )
2212@endcode
2213where void *destination is the cpy_data pointer for the thread being copied to
2214and void *source is the cpy_data pointer for the thread being copied from.
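
As an illustrative sketch only (the helper and variable names are hypothetical
and real compilers may lower the construct differently), broadcasting a single
<tt>int</tt> from the thread that executed the single region could look like:
@code
static void copy_int(void *dst, void *src) { *(int *)dst = *(int *)src; }

int x;                 // each thread's private copy
kmp_int32 didit = 0;
if (__kmpc_single(&loc, gtid)) {
  x = compute_value(); // hypothetical producer, run by the single thread
  didit = 1;
  __kmpc_end_single(&loc, gtid);
}
__kmpc_copyprivate(&loc, gtid, sizeof(x), &x, copy_int, didit);
@endcode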
2215*/
2216void __kmpc_copyprivate(ident_t *loc, kmp_int32 gtid, size_t cpy_size,
2217                        void *cpy_data, void (*cpy_func)(void *, void *),
2218                        kmp_int32 didit) {
2219  void **data_ptr;
2220  KC_TRACE(10, ("__kmpc_copyprivate: called T#%d\n", gtid));
2221  __kmp_assert_valid_gtid(gtid);
2222
2223  KMP_MB();
2224
2225  data_ptr = &__kmp_team_from_gtid(gtid)->t.t_copypriv_data;
2226
2227  if (__kmp_env_consistency_check) {
2228    if (loc == 0) {
2229      KMP_WARNING(ConstructIdentInvalid);
2230    }
2231  }
2232
2233  // ToDo: Optimize the following two barriers into some kind of split barrier
2234
2235  if (didit)
2236    *data_ptr = cpy_data;
2237
2238#if OMPT_SUPPORT
2239  ompt_frame_t *ompt_frame;
2240  if (ompt_enabled.enabled) {
2241    __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
2242    if (ompt_frame->enter_frame.ptr == NULL)
2243      ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
2244  }
2245  OMPT_STORE_RETURN_ADDRESS(gtid);
2246#endif
2247/* This barrier is not a barrier region boundary */
2248#if USE_ITT_NOTIFY
2249  __kmp_threads[gtid]->th.th_ident = loc;
2250#endif
2251  __kmp_barrier(bs_plain_barrier, gtid, FALSE, 0, NULL, NULL);
2252
2253  if (!didit)
2254    (*cpy_func)(cpy_data, *data_ptr);
2255
2256  // Consider next barrier a user-visible barrier for barrier region boundaries
2257  // Nesting checks are already handled by the single construct checks
2258  {
2259#if OMPT_SUPPORT
2260    OMPT_STORE_RETURN_ADDRESS(gtid);
2261#endif
2262#if USE_ITT_NOTIFY
    __kmp_threads[gtid]->th.th_ident = loc; // TODO: check if it is needed
    // (e.g. tasks can overwrite the location)
2265#endif
2266    __kmp_barrier(bs_plain_barrier, gtid, FALSE, 0, NULL, NULL);
2267#if OMPT_SUPPORT && OMPT_OPTIONAL
2268    if (ompt_enabled.enabled) {
2269      ompt_frame->enter_frame = ompt_data_none;
2270    }
2271#endif
2272  }
2273}
2274
2275/* --------------------------------------------------------------------------*/
2276/*!
2277@ingroup THREADPRIVATE
2278@param loc       source location information
2279@param gtid      global thread number
2280@param cpy_data  pointer to the data to be saved/copied or 0
2281@return          the saved pointer to the data
2282
__kmpc_copyprivate_light is a lighter-weight version of __kmpc_copyprivate:
it only saves the pointer it is given (if that pointer is not 0, i.e. the call
comes from the thread that executed the single region) and returns that saved
pointer from every call (the single thread itself does not need it). This
version does not do any actual data copying; the copy has to be done elsewhere,
e.g. inline in the generated code. Consequently, unlike __kmpc_copyprivate,
this function does not end with a barrier, so the generated code must place a
barrier after all of the data has been copied.
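
A minimal sketch (hypothetical names; <tt>didit</tt> is assumed to be 1 only on
the thread that executed the single region):
@code
void *src = __kmpc_copyprivate_light(&loc, gtid, didit ? (void *)&x : NULL);
if (!didit)
  x = *(int *)src;          // inline copy emitted by the compiler
__kmpc_barrier(&loc, gtid); // barrier after all copies are complete
@endcode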
2291*/
2292void *__kmpc_copyprivate_light(ident_t *loc, kmp_int32 gtid, void *cpy_data) {
2293  void **data_ptr;
2294
2295  KC_TRACE(10, ("__kmpc_copyprivate_light: called T#%d\n", gtid));
2296
2297  KMP_MB();
2298
2299  data_ptr = &__kmp_team_from_gtid(gtid)->t.t_copypriv_data;
2300
2301  if (__kmp_env_consistency_check) {
2302    if (loc == 0) {
2303      KMP_WARNING(ConstructIdentInvalid);
2304    }
2305  }
2306
2307  // ToDo: Optimize the following barrier
2308
2309  if (cpy_data)
2310    *data_ptr = cpy_data;
2311
2312#if OMPT_SUPPORT
2313  ompt_frame_t *ompt_frame;
2314  if (ompt_enabled.enabled) {
2315    __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
2316    if (ompt_frame->enter_frame.ptr == NULL)
2317      ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
2318    OMPT_STORE_RETURN_ADDRESS(gtid);
2319  }
2320#endif
2321/* This barrier is not a barrier region boundary */
2322#if USE_ITT_NOTIFY
2323  __kmp_threads[gtid]->th.th_ident = loc;
2324#endif
2325  __kmp_barrier(bs_plain_barrier, gtid, FALSE, 0, NULL, NULL);
2326
2327  return *data_ptr;
2328}
2329
2330/* -------------------------------------------------------------------------- */
2331
2332#define INIT_LOCK __kmp_init_user_lock_with_checks
2333#define INIT_NESTED_LOCK __kmp_init_nested_user_lock_with_checks
2334#define ACQUIRE_LOCK __kmp_acquire_user_lock_with_checks
2335#define ACQUIRE_LOCK_TIMED __kmp_acquire_user_lock_with_checks_timed
2336#define ACQUIRE_NESTED_LOCK __kmp_acquire_nested_user_lock_with_checks
2337#define ACQUIRE_NESTED_LOCK_TIMED                                              \
2338  __kmp_acquire_nested_user_lock_with_checks_timed
2339#define RELEASE_LOCK __kmp_release_user_lock_with_checks
2340#define RELEASE_NESTED_LOCK __kmp_release_nested_user_lock_with_checks
2341#define TEST_LOCK __kmp_test_user_lock_with_checks
2342#define TEST_NESTED_LOCK __kmp_test_nested_user_lock_with_checks
2343#define DESTROY_LOCK __kmp_destroy_user_lock_with_checks
2344#define DESTROY_NESTED_LOCK __kmp_destroy_nested_user_lock_with_checks
2345
2346// TODO: Make check abort messages use location info & pass it into
2347// with_checks routines
2348
2349#if KMP_USE_DYNAMIC_LOCK
2350
2351// internal lock initializer
2352static __forceinline void __kmp_init_lock_with_hint(ident_t *loc, void **lock,
2353                                                    kmp_dyna_lockseq_t seq) {
2354  if (KMP_IS_D_LOCK(seq)) {
2355    KMP_INIT_D_LOCK(lock, seq);
2356#if USE_ITT_BUILD
2357    __kmp_itt_lock_creating((kmp_user_lock_p)lock, NULL);
2358#endif
2359  } else {
2360    KMP_INIT_I_LOCK(lock, seq);
2361#if USE_ITT_BUILD
2362    kmp_indirect_lock_t *ilk = KMP_LOOKUP_I_LOCK(lock);
2363    __kmp_itt_lock_creating(ilk->lock, loc);
2364#endif
2365  }
2366}
2367
2368// internal nest lock initializer
2369static __forceinline void
2370__kmp_init_nest_lock_with_hint(ident_t *loc, void **lock,
2371                               kmp_dyna_lockseq_t seq) {
2372#if KMP_USE_TSX
2373  // Don't have nested lock implementation for speculative locks
2374  if (seq == lockseq_hle || seq == lockseq_rtm_queuing ||
2375      seq == lockseq_rtm_spin || seq == lockseq_adaptive)
2376    seq = __kmp_user_lock_seq;
2377#endif
2378  switch (seq) {
2379  case lockseq_tas:
2380    seq = lockseq_nested_tas;
2381    break;
2382#if KMP_USE_FUTEX
2383  case lockseq_futex:
2384    seq = lockseq_nested_futex;
2385    break;
2386#endif
2387  case lockseq_ticket:
2388    seq = lockseq_nested_ticket;
2389    break;
2390  case lockseq_queuing:
2391    seq = lockseq_nested_queuing;
2392    break;
2393  case lockseq_drdpa:
2394    seq = lockseq_nested_drdpa;
2395    break;
2396  default:
2397    seq = lockseq_nested_queuing;
2398  }
2399  KMP_INIT_I_LOCK(lock, seq);
2400#if USE_ITT_BUILD
2401  kmp_indirect_lock_t *ilk = KMP_LOOKUP_I_LOCK(lock);
2402  __kmp_itt_lock_creating(ilk->lock, loc);
2403#endif
2404}
2405
2406/* initialize the lock with a hint */
2407void __kmpc_init_lock_with_hint(ident_t *loc, kmp_int32 gtid, void **user_lock,
2408                                uintptr_t hint) {
2409  KMP_DEBUG_ASSERT(__kmp_init_serial);
2410  if (__kmp_env_consistency_check && user_lock == NULL) {
2411    KMP_FATAL(LockIsUninitialized, "omp_init_lock_with_hint");
2412  }
2413
2414  __kmp_init_lock_with_hint(loc, user_lock, __kmp_map_hint_to_lock(hint));
2415
2416#if OMPT_SUPPORT && OMPT_OPTIONAL
2417  // This is the case, if called from omp_init_lock_with_hint:
2418  void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2419  if (!codeptr)
2420    codeptr = OMPT_GET_RETURN_ADDRESS(0);
2421  if (ompt_enabled.ompt_callback_lock_init) {
2422    ompt_callbacks.ompt_callback(ompt_callback_lock_init)(
2423        ompt_mutex_lock, (omp_lock_hint_t)hint,
2424        __ompt_get_mutex_impl_type(user_lock),
2425        (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2426  }
2427#endif
2428}
2429
2430/* initialize the lock with a hint */
2431void __kmpc_init_nest_lock_with_hint(ident_t *loc, kmp_int32 gtid,
2432                                     void **user_lock, uintptr_t hint) {
2433  KMP_DEBUG_ASSERT(__kmp_init_serial);
2434  if (__kmp_env_consistency_check && user_lock == NULL) {
2435    KMP_FATAL(LockIsUninitialized, "omp_init_nest_lock_with_hint");
2436  }
2437
2438  __kmp_init_nest_lock_with_hint(loc, user_lock, __kmp_map_hint_to_lock(hint));
2439
2440#if OMPT_SUPPORT && OMPT_OPTIONAL
2441  // This is the case, if called from omp_init_lock_with_hint:
2442  void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2443  if (!codeptr)
2444    codeptr = OMPT_GET_RETURN_ADDRESS(0);
2445  if (ompt_enabled.ompt_callback_lock_init) {
2446    ompt_callbacks.ompt_callback(ompt_callback_lock_init)(
2447        ompt_mutex_nest_lock, (omp_lock_hint_t)hint,
2448        __ompt_get_mutex_impl_type(user_lock),
2449        (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2450  }
2451#endif
2452}
2453
2454#endif // KMP_USE_DYNAMIC_LOCK
2455
2456/* initialize the lock */
2457void __kmpc_init_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2458#if KMP_USE_DYNAMIC_LOCK
2459
2460  KMP_DEBUG_ASSERT(__kmp_init_serial);
2461  if (__kmp_env_consistency_check && user_lock == NULL) {
2462    KMP_FATAL(LockIsUninitialized, "omp_init_lock");
2463  }
2464  __kmp_init_lock_with_hint(loc, user_lock, __kmp_user_lock_seq);
2465
2466#if OMPT_SUPPORT && OMPT_OPTIONAL
2467  // This is the case, if called from omp_init_lock_with_hint:
2468  void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2469  if (!codeptr)
2470    codeptr = OMPT_GET_RETURN_ADDRESS(0);
2471  if (ompt_enabled.ompt_callback_lock_init) {
2472    ompt_callbacks.ompt_callback(ompt_callback_lock_init)(
2473        ompt_mutex_lock, omp_lock_hint_none,
2474        __ompt_get_mutex_impl_type(user_lock),
2475        (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2476  }
2477#endif
2478
2479#else // KMP_USE_DYNAMIC_LOCK
2480
2481  static char const *const func = "omp_init_lock";
2482  kmp_user_lock_p lck;
2483  KMP_DEBUG_ASSERT(__kmp_init_serial);
2484
2485  if (__kmp_env_consistency_check) {
2486    if (user_lock == NULL) {
2487      KMP_FATAL(LockIsUninitialized, func);
2488    }
2489  }
2490
2491  KMP_CHECK_USER_LOCK_INIT();
2492
2493  if ((__kmp_user_lock_kind == lk_tas) &&
2494      (sizeof(lck->tas.lk.poll) <= OMP_LOCK_T_SIZE)) {
2495    lck = (kmp_user_lock_p)user_lock;
2496  }
2497#if KMP_USE_FUTEX
2498  else if ((__kmp_user_lock_kind == lk_futex) &&
2499           (sizeof(lck->futex.lk.poll) <= OMP_LOCK_T_SIZE)) {
2500    lck = (kmp_user_lock_p)user_lock;
2501  }
2502#endif
2503  else {
2504    lck = __kmp_user_lock_allocate(user_lock, gtid, 0);
2505  }
2506  INIT_LOCK(lck);
2507  __kmp_set_user_lock_location(lck, loc);
2508
2509#if OMPT_SUPPORT && OMPT_OPTIONAL
2510  // This is the case, if called from omp_init_lock_with_hint:
2511  void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2512  if (!codeptr)
2513    codeptr = OMPT_GET_RETURN_ADDRESS(0);
2514  if (ompt_enabled.ompt_callback_lock_init) {
2515    ompt_callbacks.ompt_callback(ompt_callback_lock_init)(
2516        ompt_mutex_lock, omp_lock_hint_none, __ompt_get_mutex_impl_type(),
2517        (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2518  }
2519#endif
2520
2521#if USE_ITT_BUILD
2522  __kmp_itt_lock_creating(lck);
2523#endif /* USE_ITT_BUILD */
2524
2525#endif // KMP_USE_DYNAMIC_LOCK
2526} // __kmpc_init_lock
2527
2528/* initialize the lock */
2529void __kmpc_init_nest_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2530#if KMP_USE_DYNAMIC_LOCK
2531
2532  KMP_DEBUG_ASSERT(__kmp_init_serial);
2533  if (__kmp_env_consistency_check && user_lock == NULL) {
2534    KMP_FATAL(LockIsUninitialized, "omp_init_nest_lock");
2535  }
2536  __kmp_init_nest_lock_with_hint(loc, user_lock, __kmp_user_lock_seq);
2537
2538#if OMPT_SUPPORT && OMPT_OPTIONAL
2539  // This is the case, if called from omp_init_lock_with_hint:
2540  void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2541  if (!codeptr)
2542    codeptr = OMPT_GET_RETURN_ADDRESS(0);
2543  if (ompt_enabled.ompt_callback_lock_init) {
2544    ompt_callbacks.ompt_callback(ompt_callback_lock_init)(
2545        ompt_mutex_nest_lock, omp_lock_hint_none,
2546        __ompt_get_mutex_impl_type(user_lock),
2547        (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2548  }
2549#endif
2550
2551#else // KMP_USE_DYNAMIC_LOCK
2552
2553  static char const *const func = "omp_init_nest_lock";
2554  kmp_user_lock_p lck;
2555  KMP_DEBUG_ASSERT(__kmp_init_serial);
2556
2557  if (__kmp_env_consistency_check) {
2558    if (user_lock == NULL) {
2559      KMP_FATAL(LockIsUninitialized, func);
2560    }
2561  }
2562
2563  KMP_CHECK_USER_LOCK_INIT();
2564
2565  if ((__kmp_user_lock_kind == lk_tas) &&
2566      (sizeof(lck->tas.lk.poll) + sizeof(lck->tas.lk.depth_locked) <=
2567       OMP_NEST_LOCK_T_SIZE)) {
2568    lck = (kmp_user_lock_p)user_lock;
2569  }
2570#if KMP_USE_FUTEX
2571  else if ((__kmp_user_lock_kind == lk_futex) &&
2572           (sizeof(lck->futex.lk.poll) + sizeof(lck->futex.lk.depth_locked) <=
2573            OMP_NEST_LOCK_T_SIZE)) {
2574    lck = (kmp_user_lock_p)user_lock;
2575  }
2576#endif
2577  else {
2578    lck = __kmp_user_lock_allocate(user_lock, gtid, 0);
2579  }
2580
2581  INIT_NESTED_LOCK(lck);
2582  __kmp_set_user_lock_location(lck, loc);
2583
2584#if OMPT_SUPPORT && OMPT_OPTIONAL
2585  // This is the case, if called from omp_init_lock_with_hint:
2586  void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2587  if (!codeptr)
2588    codeptr = OMPT_GET_RETURN_ADDRESS(0);
2589  if (ompt_enabled.ompt_callback_lock_init) {
2590    ompt_callbacks.ompt_callback(ompt_callback_lock_init)(
2591        ompt_mutex_nest_lock, omp_lock_hint_none, __ompt_get_mutex_impl_type(),
2592        (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2593  }
2594#endif
2595
2596#if USE_ITT_BUILD
2597  __kmp_itt_lock_creating(lck);
2598#endif /* USE_ITT_BUILD */
2599
2600#endif // KMP_USE_DYNAMIC_LOCK
2601} // __kmpc_init_nest_lock
2602
2603void __kmpc_destroy_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2604#if KMP_USE_DYNAMIC_LOCK
2605
2606#if USE_ITT_BUILD
2607  kmp_user_lock_p lck;
2608  if (KMP_EXTRACT_D_TAG(user_lock) == 0) {
2609    lck = ((kmp_indirect_lock_t *)KMP_LOOKUP_I_LOCK(user_lock))->lock;
2610  } else {
2611    lck = (kmp_user_lock_p)user_lock;
2612  }
2613  __kmp_itt_lock_destroyed(lck);
2614#endif
2615#if OMPT_SUPPORT && OMPT_OPTIONAL
2616  // This is the case, if called from omp_init_lock_with_hint:
2617  void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2618  if (!codeptr)
2619    codeptr = OMPT_GET_RETURN_ADDRESS(0);
2620  if (ompt_enabled.ompt_callback_lock_destroy) {
2621    ompt_callbacks.ompt_callback(ompt_callback_lock_destroy)(
2622        ompt_mutex_lock, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2623  }
2624#endif
2625  KMP_D_LOCK_FUNC(user_lock, destroy)((kmp_dyna_lock_t *)user_lock);
2626#else
2627  kmp_user_lock_p lck;
2628
2629  if ((__kmp_user_lock_kind == lk_tas) &&
2630      (sizeof(lck->tas.lk.poll) <= OMP_LOCK_T_SIZE)) {
2631    lck = (kmp_user_lock_p)user_lock;
2632  }
2633#if KMP_USE_FUTEX
2634  else if ((__kmp_user_lock_kind == lk_futex) &&
2635           (sizeof(lck->futex.lk.poll) <= OMP_LOCK_T_SIZE)) {
2636    lck = (kmp_user_lock_p)user_lock;
2637  }
2638#endif
2639  else {
2640    lck = __kmp_lookup_user_lock(user_lock, "omp_destroy_lock");
2641  }
2642
2643#if OMPT_SUPPORT && OMPT_OPTIONAL
2644  // This is the case, if called from omp_init_lock_with_hint:
2645  void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2646  if (!codeptr)
2647    codeptr = OMPT_GET_RETURN_ADDRESS(0);
2648  if (ompt_enabled.ompt_callback_lock_destroy) {
2649    ompt_callbacks.ompt_callback(ompt_callback_lock_destroy)(
2650        ompt_mutex_lock, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2651  }
2652#endif
2653
2654#if USE_ITT_BUILD
2655  __kmp_itt_lock_destroyed(lck);
2656#endif /* USE_ITT_BUILD */
2657  DESTROY_LOCK(lck);
2658
2659  if ((__kmp_user_lock_kind == lk_tas) &&
2660      (sizeof(lck->tas.lk.poll) <= OMP_LOCK_T_SIZE)) {
2661    ;
2662  }
2663#if KMP_USE_FUTEX
2664  else if ((__kmp_user_lock_kind == lk_futex) &&
2665           (sizeof(lck->futex.lk.poll) <= OMP_LOCK_T_SIZE)) {
2666    ;
2667  }
2668#endif
2669  else {
2670    __kmp_user_lock_free(user_lock, gtid, lck);
2671  }
2672#endif // KMP_USE_DYNAMIC_LOCK
2673} // __kmpc_destroy_lock
2674
2675/* destroy the lock */
2676void __kmpc_destroy_nest_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2677#if KMP_USE_DYNAMIC_LOCK
2678
2679#if USE_ITT_BUILD
2680  kmp_indirect_lock_t *ilk = KMP_LOOKUP_I_LOCK(user_lock);
2681  __kmp_itt_lock_destroyed(ilk->lock);
2682#endif
2683#if OMPT_SUPPORT && OMPT_OPTIONAL
2684  // This is the case, if called from omp_init_lock_with_hint:
2685  void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2686  if (!codeptr)
2687    codeptr = OMPT_GET_RETURN_ADDRESS(0);
2688  if (ompt_enabled.ompt_callback_lock_destroy) {
2689    ompt_callbacks.ompt_callback(ompt_callback_lock_destroy)(
2690        ompt_mutex_nest_lock, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2691  }
2692#endif
2693  KMP_D_LOCK_FUNC(user_lock, destroy)((kmp_dyna_lock_t *)user_lock);
2694
2695#else // KMP_USE_DYNAMIC_LOCK
2696
2697  kmp_user_lock_p lck;
2698
2699  if ((__kmp_user_lock_kind == lk_tas) &&
2700      (sizeof(lck->tas.lk.poll) + sizeof(lck->tas.lk.depth_locked) <=
2701       OMP_NEST_LOCK_T_SIZE)) {
2702    lck = (kmp_user_lock_p)user_lock;
2703  }
2704#if KMP_USE_FUTEX
2705  else if ((__kmp_user_lock_kind == lk_futex) &&
2706           (sizeof(lck->futex.lk.poll) + sizeof(lck->futex.lk.depth_locked) <=
2707            OMP_NEST_LOCK_T_SIZE)) {
2708    lck = (kmp_user_lock_p)user_lock;
2709  }
2710#endif
2711  else {
2712    lck = __kmp_lookup_user_lock(user_lock, "omp_destroy_nest_lock");
2713  }
2714
2715#if OMPT_SUPPORT && OMPT_OPTIONAL
2716  // This is the case, if called from omp_init_lock_with_hint:
2717  void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2718  if (!codeptr)
2719    codeptr = OMPT_GET_RETURN_ADDRESS(0);
2720  if (ompt_enabled.ompt_callback_lock_destroy) {
2721    ompt_callbacks.ompt_callback(ompt_callback_lock_destroy)(
2722        ompt_mutex_nest_lock, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2723  }
2724#endif
2725
2726#if USE_ITT_BUILD
2727  __kmp_itt_lock_destroyed(lck);
2728#endif /* USE_ITT_BUILD */
2729
2730  DESTROY_NESTED_LOCK(lck);
2731
2732  if ((__kmp_user_lock_kind == lk_tas) &&
2733      (sizeof(lck->tas.lk.poll) + sizeof(lck->tas.lk.depth_locked) <=
2734       OMP_NEST_LOCK_T_SIZE)) {
2735    ;
2736  }
2737#if KMP_USE_FUTEX
2738  else if ((__kmp_user_lock_kind == lk_futex) &&
2739           (sizeof(lck->futex.lk.poll) + sizeof(lck->futex.lk.depth_locked) <=
2740            OMP_NEST_LOCK_T_SIZE)) {
2741    ;
2742  }
2743#endif
2744  else {
2745    __kmp_user_lock_free(user_lock, gtid, lck);
2746  }
2747#endif // KMP_USE_DYNAMIC_LOCK
2748} // __kmpc_destroy_nest_lock
2749
2750void __kmpc_set_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2751  KMP_COUNT_BLOCK(OMP_set_lock);
2752#if KMP_USE_DYNAMIC_LOCK
2753  int tag = KMP_EXTRACT_D_TAG(user_lock);
2754#if USE_ITT_BUILD
2755  __kmp_itt_lock_acquiring(
2756      (kmp_user_lock_p)
2757          user_lock); // itt function will get to the right lock object.
2758#endif
2759#if OMPT_SUPPORT && OMPT_OPTIONAL
2760  // This is the case, if called from omp_init_lock_with_hint:
2761  void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2762  if (!codeptr)
2763    codeptr = OMPT_GET_RETURN_ADDRESS(0);
2764  if (ompt_enabled.ompt_callback_mutex_acquire) {
2765    ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
2766        ompt_mutex_lock, omp_lock_hint_none,
2767        __ompt_get_mutex_impl_type(user_lock),
2768        (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2769  }
2770#endif
2771#if KMP_USE_INLINED_TAS
2772  if (tag == locktag_tas && !__kmp_env_consistency_check) {
2773    KMP_ACQUIRE_TAS_LOCK(user_lock, gtid);
2774  } else
2775#elif KMP_USE_INLINED_FUTEX
2776  if (tag == locktag_futex && !__kmp_env_consistency_check) {
2777    KMP_ACQUIRE_FUTEX_LOCK(user_lock, gtid);
2778  } else
2779#endif
2780  {
2781    __kmp_direct_set[tag]((kmp_dyna_lock_t *)user_lock, gtid);
2782  }
2783#if USE_ITT_BUILD
2784  __kmp_itt_lock_acquired((kmp_user_lock_p)user_lock);
2785#endif
2786#if OMPT_SUPPORT && OMPT_OPTIONAL
2787  if (ompt_enabled.ompt_callback_mutex_acquired) {
2788    ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
2789        ompt_mutex_lock, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2790  }
2791#endif
2792
2793#else // KMP_USE_DYNAMIC_LOCK
2794
2795  kmp_user_lock_p lck;
2796
2797  if ((__kmp_user_lock_kind == lk_tas) &&
2798      (sizeof(lck->tas.lk.poll) <= OMP_LOCK_T_SIZE)) {
2799    lck = (kmp_user_lock_p)user_lock;
2800  }
2801#if KMP_USE_FUTEX
2802  else if ((__kmp_user_lock_kind == lk_futex) &&
2803           (sizeof(lck->futex.lk.poll) <= OMP_LOCK_T_SIZE)) {
2804    lck = (kmp_user_lock_p)user_lock;
2805  }
2806#endif
2807  else {
2808    lck = __kmp_lookup_user_lock(user_lock, "omp_set_lock");
2809  }
2810
2811#if USE_ITT_BUILD
2812  __kmp_itt_lock_acquiring(lck);
2813#endif /* USE_ITT_BUILD */
2814#if OMPT_SUPPORT && OMPT_OPTIONAL
2815  // This is the case, if called from omp_init_lock_with_hint:
2816  void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2817  if (!codeptr)
2818    codeptr = OMPT_GET_RETURN_ADDRESS(0);
2819  if (ompt_enabled.ompt_callback_mutex_acquire) {
2820    ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
2821        ompt_mutex_lock, omp_lock_hint_none, __ompt_get_mutex_impl_type(),
2822        (ompt_wait_id_t)(uintptr_t)lck, codeptr);
2823  }
2824#endif
2825
2826  ACQUIRE_LOCK(lck, gtid);
2827
2828#if USE_ITT_BUILD
2829  __kmp_itt_lock_acquired(lck);
2830#endif /* USE_ITT_BUILD */
2831
2832#if OMPT_SUPPORT && OMPT_OPTIONAL
2833  if (ompt_enabled.ompt_callback_mutex_acquired) {
2834    ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
2835        ompt_mutex_lock, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
2836  }
2837#endif
2838
2839#endif // KMP_USE_DYNAMIC_LOCK
2840}
2841
2842void __kmpc_set_nest_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2843#if KMP_USE_DYNAMIC_LOCK
2844
2845#if USE_ITT_BUILD
2846  __kmp_itt_lock_acquiring((kmp_user_lock_p)user_lock);
2847#endif
2848#if OMPT_SUPPORT && OMPT_OPTIONAL
2849  // This is the case, if called from omp_init_lock_with_hint:
2850  void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2851  if (!codeptr)
2852    codeptr = OMPT_GET_RETURN_ADDRESS(0);
2853  if (ompt_enabled.enabled) {
2854    if (ompt_enabled.ompt_callback_mutex_acquire) {
2855      ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
2856          ompt_mutex_nest_lock, omp_lock_hint_none,
2857          __ompt_get_mutex_impl_type(user_lock),
2858          (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2859    }
2860  }
2861#endif
2862  int acquire_status =
2863      KMP_D_LOCK_FUNC(user_lock, set)((kmp_dyna_lock_t *)user_lock, gtid);
2864  (void)acquire_status;
2865#if USE_ITT_BUILD
2866  __kmp_itt_lock_acquired((kmp_user_lock_p)user_lock);
2867#endif
2868
2869#if OMPT_SUPPORT && OMPT_OPTIONAL
2870  if (ompt_enabled.enabled) {
2871    if (acquire_status == KMP_LOCK_ACQUIRED_FIRST) {
2872      if (ompt_enabled.ompt_callback_mutex_acquired) {
2873        // lock_first
2874        ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
2875            ompt_mutex_nest_lock, (ompt_wait_id_t)(uintptr_t)user_lock,
2876            codeptr);
2877      }
2878    } else {
2879      if (ompt_enabled.ompt_callback_nest_lock) {
2880        // lock_next
2881        ompt_callbacks.ompt_callback(ompt_callback_nest_lock)(
2882            ompt_scope_begin, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2883      }
2884    }
2885  }
2886#endif
2887
2888#else // KMP_USE_DYNAMIC_LOCK
2889  int acquire_status;
2890  kmp_user_lock_p lck;
2891
2892  if ((__kmp_user_lock_kind == lk_tas) &&
2893      (sizeof(lck->tas.lk.poll) + sizeof(lck->tas.lk.depth_locked) <=
2894       OMP_NEST_LOCK_T_SIZE)) {
2895    lck = (kmp_user_lock_p)user_lock;
2896  }
2897#if KMP_USE_FUTEX
2898  else if ((__kmp_user_lock_kind == lk_futex) &&
2899           (sizeof(lck->futex.lk.poll) + sizeof(lck->futex.lk.depth_locked) <=
2900            OMP_NEST_LOCK_T_SIZE)) {
2901    lck = (kmp_user_lock_p)user_lock;
2902  }
2903#endif
2904  else {
2905    lck = __kmp_lookup_user_lock(user_lock, "omp_set_nest_lock");
2906  }
2907
2908#if USE_ITT_BUILD
2909  __kmp_itt_lock_acquiring(lck);
2910#endif /* USE_ITT_BUILD */
2911#if OMPT_SUPPORT && OMPT_OPTIONAL
2912  // This is the case, if called from omp_init_lock_with_hint:
2913  void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2914  if (!codeptr)
2915    codeptr = OMPT_GET_RETURN_ADDRESS(0);
2916  if (ompt_enabled.enabled) {
2917    if (ompt_enabled.ompt_callback_mutex_acquire) {
2918      ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
2919          ompt_mutex_nest_lock, omp_lock_hint_none,
2920          __ompt_get_mutex_impl_type(), (ompt_wait_id_t)(uintptr_t)lck,
2921          codeptr);
2922    }
2923  }
2924#endif
2925
2926  ACQUIRE_NESTED_LOCK(lck, gtid, &acquire_status);
2927
2928#if USE_ITT_BUILD
2929  __kmp_itt_lock_acquired(lck);
2930#endif /* USE_ITT_BUILD */
2931
2932#if OMPT_SUPPORT && OMPT_OPTIONAL
2933  if (ompt_enabled.enabled) {
2934    if (acquire_status == KMP_LOCK_ACQUIRED_FIRST) {
2935      if (ompt_enabled.ompt_callback_mutex_acquired) {
2936        // lock_first
2937        ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
2938            ompt_mutex_nest_lock, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
2939      }
2940    } else {
2941      if (ompt_enabled.ompt_callback_nest_lock) {
2942        // lock_next
2943        ompt_callbacks.ompt_callback(ompt_callback_nest_lock)(
2944            ompt_scope_begin, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
2945      }
2946    }
2947  }
2948#endif
2949
2950#endif // KMP_USE_DYNAMIC_LOCK
2951}
2952
2953void __kmpc_unset_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2954#if KMP_USE_DYNAMIC_LOCK
2955
2956  int tag = KMP_EXTRACT_D_TAG(user_lock);
2957#if USE_ITT_BUILD
2958  __kmp_itt_lock_releasing((kmp_user_lock_p)user_lock);
2959#endif
2960#if KMP_USE_INLINED_TAS
2961  if (tag == locktag_tas && !__kmp_env_consistency_check) {
2962    KMP_RELEASE_TAS_LOCK(user_lock, gtid);
2963  } else
2964#elif KMP_USE_INLINED_FUTEX
2965  if (tag == locktag_futex && !__kmp_env_consistency_check) {
2966    KMP_RELEASE_FUTEX_LOCK(user_lock, gtid);
2967  } else
2968#endif
2969  {
2970    __kmp_direct_unset[tag]((kmp_dyna_lock_t *)user_lock, gtid);
2971  }
2972
2973#if OMPT_SUPPORT && OMPT_OPTIONAL
2974  // This is the case, if called from omp_init_lock_with_hint:
2975  void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2976  if (!codeptr)
2977    codeptr = OMPT_GET_RETURN_ADDRESS(0);
2978  if (ompt_enabled.ompt_callback_mutex_released) {
2979    ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
2980        ompt_mutex_lock, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2981  }
2982#endif
2983
2984#else // KMP_USE_DYNAMIC_LOCK
2985
2986  kmp_user_lock_p lck;
2987
2988  /* Can't use serial interval since not block structured */
2989  /* release the lock */
2990
2991  if ((__kmp_user_lock_kind == lk_tas) &&
2992      (sizeof(lck->tas.lk.poll) <= OMP_LOCK_T_SIZE)) {
2993#if KMP_OS_LINUX &&                                                            \
2994    (KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64)
2995// "fast" path implemented to fix customer performance issue
2996#if USE_ITT_BUILD
2997    __kmp_itt_lock_releasing((kmp_user_lock_p)user_lock);
2998#endif /* USE_ITT_BUILD */
2999    TCW_4(((kmp_user_lock_p)user_lock)->tas.lk.poll, 0);
3000    KMP_MB();
3001
3002#if OMPT_SUPPORT && OMPT_OPTIONAL
3003    // This is the case, if called from omp_init_lock_with_hint:
3004    void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
3005    if (!codeptr)
3006      codeptr = OMPT_GET_RETURN_ADDRESS(0);
3007    if (ompt_enabled.ompt_callback_mutex_released) {
3008      ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
3009          ompt_mutex_lock, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
3010    }
3011#endif
3012
3013    return;
3014#else
3015    lck = (kmp_user_lock_p)user_lock;
3016#endif
3017  }
3018#if KMP_USE_FUTEX
3019  else if ((__kmp_user_lock_kind == lk_futex) &&
3020           (sizeof(lck->futex.lk.poll) <= OMP_LOCK_T_SIZE)) {
3021    lck = (kmp_user_lock_p)user_lock;
3022  }
3023#endif
3024  else {
3025    lck = __kmp_lookup_user_lock(user_lock, "omp_unset_lock");
3026  }
3027
3028#if USE_ITT_BUILD
3029  __kmp_itt_lock_releasing(lck);
3030#endif /* USE_ITT_BUILD */
3031
3032  RELEASE_LOCK(lck, gtid);
3033
3034#if OMPT_SUPPORT && OMPT_OPTIONAL
3035  // This is the case, if called from omp_init_lock_with_hint:
3036  void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
3037  if (!codeptr)
3038    codeptr = OMPT_GET_RETURN_ADDRESS(0);
3039  if (ompt_enabled.ompt_callback_mutex_released) {
3040    ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
3041        ompt_mutex_lock, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
3042  }
3043#endif
3044
3045#endif // KMP_USE_DYNAMIC_LOCK
3046}
3047
3048/* release the lock */
3049void __kmpc_unset_nest_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
3050#if KMP_USE_DYNAMIC_LOCK
3051
3052#if USE_ITT_BUILD
3053  __kmp_itt_lock_releasing((kmp_user_lock_p)user_lock);
3054#endif
3055  int release_status =
3056      KMP_D_LOCK_FUNC(user_lock, unset)((kmp_dyna_lock_t *)user_lock, gtid);
3057  (void)release_status;
3058
3059#if OMPT_SUPPORT && OMPT_OPTIONAL
3060  // This is the case, if called from omp_init_lock_with_hint:
3061  void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
3062  if (!codeptr)
3063    codeptr = OMPT_GET_RETURN_ADDRESS(0);
3064  if (ompt_enabled.enabled) {
3065    if (release_status == KMP_LOCK_RELEASED) {
3066      if (ompt_enabled.ompt_callback_mutex_released) {
3067        // release_lock_last
3068        ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
3069            ompt_mutex_nest_lock, (ompt_wait_id_t)(uintptr_t)user_lock,
3070            codeptr);
3071      }
3072    } else if (ompt_enabled.ompt_callback_nest_lock) {
3073      // release_lock_prev
3074      ompt_callbacks.ompt_callback(ompt_callback_nest_lock)(
3075          ompt_scope_end, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
3076    }
3077  }
3078#endif
3079
3080#else // KMP_USE_DYNAMIC_LOCK
3081
3082  kmp_user_lock_p lck;
3083
3084  /* Can't use serial interval since not block structured */
3085
3086  if ((__kmp_user_lock_kind == lk_tas) &&
3087      (sizeof(lck->tas.lk.poll) + sizeof(lck->tas.lk.depth_locked) <=
3088       OMP_NEST_LOCK_T_SIZE)) {
3089#if KMP_OS_LINUX &&                                                            \
3090    (KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64)
3091    // "fast" path implemented to fix customer performance issue
3092    kmp_tas_lock_t *tl = (kmp_tas_lock_t *)user_lock;
3093#if USE_ITT_BUILD
3094    __kmp_itt_lock_releasing((kmp_user_lock_p)user_lock);
3095#endif /* USE_ITT_BUILD */
3096
3097#if OMPT_SUPPORT && OMPT_OPTIONAL
3098    int release_status = KMP_LOCK_STILL_HELD;
3099#endif
3100
3101    if (--(tl->lk.depth_locked) == 0) {
3102      TCW_4(tl->lk.poll, 0);
3103#if OMPT_SUPPORT && OMPT_OPTIONAL
3104      release_status = KMP_LOCK_RELEASED;
3105#endif
3106    }
3107    KMP_MB();
3108
3109#if OMPT_SUPPORT && OMPT_OPTIONAL
3110    // This is the case, if called from omp_init_lock_with_hint:
3111    void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
3112    if (!codeptr)
3113      codeptr = OMPT_GET_RETURN_ADDRESS(0);
3114    if (ompt_enabled.enabled) {
3115      if (release_status == KMP_LOCK_RELEASED) {
3116        if (ompt_enabled.ompt_callback_mutex_released) {
3117          // release_lock_last
3118          ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
3119              ompt_mutex_nest_lock, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
3120        }
3121      } else if (ompt_enabled.ompt_callback_nest_lock) {
3122        // release_lock_previous
3123        ompt_callbacks.ompt_callback(ompt_callback_nest_lock)(
            ompt_scope_end, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
3125      }
3126    }
3127#endif
3128
3129    return;
3130#else
3131    lck = (kmp_user_lock_p)user_lock;
3132#endif
3133  }
3134#if KMP_USE_FUTEX
3135  else if ((__kmp_user_lock_kind == lk_futex) &&
3136           (sizeof(lck->futex.lk.poll) + sizeof(lck->futex.lk.depth_locked) <=
3137            OMP_NEST_LOCK_T_SIZE)) {
3138    lck = (kmp_user_lock_p)user_lock;
3139  }
3140#endif
3141  else {
3142    lck = __kmp_lookup_user_lock(user_lock, "omp_unset_nest_lock");
3143  }
3144
3145#if USE_ITT_BUILD
3146  __kmp_itt_lock_releasing(lck);
3147#endif /* USE_ITT_BUILD */
3148
3149  int release_status;
3150  release_status = RELEASE_NESTED_LOCK(lck, gtid);
3151#if OMPT_SUPPORT && OMPT_OPTIONAL
3152  // This is the case, if called from omp_init_lock_with_hint:
3153  void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
3154  if (!codeptr)
3155    codeptr = OMPT_GET_RETURN_ADDRESS(0);
3156  if (ompt_enabled.enabled) {
3157    if (release_status == KMP_LOCK_RELEASED) {
3158      if (ompt_enabled.ompt_callback_mutex_released) {
3159        // release_lock_last
3160        ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
3161            ompt_mutex_nest_lock, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
3162      }
3163    } else if (ompt_enabled.ompt_callback_nest_lock) {
3164      // release_lock_previous
3165      ompt_callbacks.ompt_callback(ompt_callback_nest_lock)(
          ompt_scope_end, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
3167    }
3168  }
3169#endif
3170
3171#endif // KMP_USE_DYNAMIC_LOCK
3172}
3173
3174/* try to acquire the lock */
3175int __kmpc_test_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
3176  KMP_COUNT_BLOCK(OMP_test_lock);
3177
3178#if KMP_USE_DYNAMIC_LOCK
3179  int rc;
3180  int tag = KMP_EXTRACT_D_TAG(user_lock);
3181#if USE_ITT_BUILD
3182  __kmp_itt_lock_acquiring((kmp_user_lock_p)user_lock);
3183#endif
3184#if OMPT_SUPPORT && OMPT_OPTIONAL
3185  // This is the case, if called from omp_init_lock_with_hint:
3186  void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
3187  if (!codeptr)
3188    codeptr = OMPT_GET_RETURN_ADDRESS(0);
3189  if (ompt_enabled.ompt_callback_mutex_acquire) {
3190    ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
3191        ompt_mutex_test_lock, omp_lock_hint_none,
3192        __ompt_get_mutex_impl_type(user_lock),
3193        (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
3194  }
3195#endif
3196#if KMP_USE_INLINED_TAS
3197  if (tag == locktag_tas && !__kmp_env_consistency_check) {
3198    KMP_TEST_TAS_LOCK(user_lock, gtid, rc);
3199  } else
3200#elif KMP_USE_INLINED_FUTEX
3201  if (tag == locktag_futex && !__kmp_env_consistency_check) {
3202    KMP_TEST_FUTEX_LOCK(user_lock, gtid, rc);
3203  } else
3204#endif
3205  {
3206    rc = __kmp_direct_test[tag]((kmp_dyna_lock_t *)user_lock, gtid);
3207  }
3208  if (rc) {
3209#if USE_ITT_BUILD
3210    __kmp_itt_lock_acquired((kmp_user_lock_p)user_lock);
3211#endif
3212#if OMPT_SUPPORT && OMPT_OPTIONAL
3213    if (ompt_enabled.ompt_callback_mutex_acquired) {
3214      ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
3215          ompt_mutex_test_lock, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
3216    }
3217#endif
3218    return FTN_TRUE;
3219  } else {
3220#if USE_ITT_BUILD
3221    __kmp_itt_lock_cancelled((kmp_user_lock_p)user_lock);
3222#endif
3223    return FTN_FALSE;
3224  }
3225
3226#else // KMP_USE_DYNAMIC_LOCK
3227
3228  kmp_user_lock_p lck;
3229  int rc;
3230
3231  if ((__kmp_user_lock_kind == lk_tas) &&
3232      (sizeof(lck->tas.lk.poll) <= OMP_LOCK_T_SIZE)) {
3233    lck = (kmp_user_lock_p)user_lock;
3234  }
3235#if KMP_USE_FUTEX
3236  else if ((__kmp_user_lock_kind == lk_futex) &&
3237           (sizeof(lck->futex.lk.poll) <= OMP_LOCK_T_SIZE)) {
3238    lck = (kmp_user_lock_p)user_lock;
3239  }
3240#endif
3241  else {
3242    lck = __kmp_lookup_user_lock(user_lock, "omp_test_lock");
3243  }
3244
3245#if USE_ITT_BUILD
3246  __kmp_itt_lock_acquiring(lck);
3247#endif /* USE_ITT_BUILD */
3248#if OMPT_SUPPORT && OMPT_OPTIONAL
3249  // This is the case, if called from omp_init_lock_with_hint:
3250  void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
3251  if (!codeptr)
3252    codeptr = OMPT_GET_RETURN_ADDRESS(0);
3253  if (ompt_enabled.ompt_callback_mutex_acquire) {
3254    ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
3255        ompt_mutex_test_lock, omp_lock_hint_none, __ompt_get_mutex_impl_type(),
3256        (ompt_wait_id_t)(uintptr_t)lck, codeptr);
3257  }
3258#endif
3259
3260  rc = TEST_LOCK(lck, gtid);
3261#if USE_ITT_BUILD
3262  if (rc) {
3263    __kmp_itt_lock_acquired(lck);
3264  } else {
3265    __kmp_itt_lock_cancelled(lck);
3266  }
3267#endif /* USE_ITT_BUILD */
3268#if OMPT_SUPPORT && OMPT_OPTIONAL
3269  if (rc && ompt_enabled.ompt_callback_mutex_acquired) {
3270    ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
3271        ompt_mutex_test_lock, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
3272  }
3273#endif
3274
3275  return (rc ? FTN_TRUE : FTN_FALSE);
3276
3277  /* Can't use serial interval since not block structured */
3278
3279#endif // KMP_USE_DYNAMIC_LOCK
3280}
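
// A minimal user-level sketch (not part of this runtime) of the behavior the
// lookup string "omp_test_lock" above points at: a non-blocking acquire that
// returns nonzero on success and zero if the lock is already held.
//
//   #include <omp.h>
//   #include <stdio.h>
//
//   int main(void) {
//     omp_lock_t l;
//     omp_init_lock(&l);
//   #pragma omp parallel num_threads(4)
//     {
//       if (omp_test_lock(&l)) { // returns immediately, never blocks
//         printf("thread %d got the lock\n", omp_get_thread_num());
//         omp_unset_lock(&l);
//       } // otherwise skip the protected work instead of waiting
//     }
//     omp_destroy_lock(&l);
//     return 0;
//   }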
3281
3282/* try to acquire the lock */
3283int __kmpc_test_nest_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
3284#if KMP_USE_DYNAMIC_LOCK
3285  int rc;
3286#if USE_ITT_BUILD
3287  __kmp_itt_lock_acquiring((kmp_user_lock_p)user_lock);
3288#endif
3289#if OMPT_SUPPORT && OMPT_OPTIONAL
3290  // This is the case, if called from omp_init_lock_with_hint:
3291  void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
3292  if (!codeptr)
3293    codeptr = OMPT_GET_RETURN_ADDRESS(0);
3294  if (ompt_enabled.ompt_callback_mutex_acquire) {
3295    ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
3296        ompt_mutex_test_nest_lock, omp_lock_hint_none,
3297        __ompt_get_mutex_impl_type(user_lock),
3298        (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
3299  }
3300#endif
3301  rc = KMP_D_LOCK_FUNC(user_lock, test)((kmp_dyna_lock_t *)user_lock, gtid);
3302#if USE_ITT_BUILD
3303  if (rc) {
3304    __kmp_itt_lock_acquired((kmp_user_lock_p)user_lock);
3305  } else {
3306    __kmp_itt_lock_cancelled((kmp_user_lock_p)user_lock);
3307  }
3308#endif
3309#if OMPT_SUPPORT && OMPT_OPTIONAL
3310  if (ompt_enabled.enabled && rc) {
3311    if (rc == 1) {
3312      if (ompt_enabled.ompt_callback_mutex_acquired) {
3313        // lock_first
3314        ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
3315            ompt_mutex_test_nest_lock, (ompt_wait_id_t)(uintptr_t)user_lock,
3316            codeptr);
3317      }
3318    } else {
3319      if (ompt_enabled.ompt_callback_nest_lock) {
3320        // lock_next
3321        ompt_callbacks.ompt_callback(ompt_callback_nest_lock)(
3322            ompt_scope_begin, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
3323      }
3324    }
3325  }
3326#endif
3327  return rc;
3328
3329#else // KMP_USE_DYNAMIC_LOCK
3330
3331  kmp_user_lock_p lck;
3332  int rc;
3333
3334  if ((__kmp_user_lock_kind == lk_tas) &&
3335      (sizeof(lck->tas.lk.poll) + sizeof(lck->tas.lk.depth_locked) <=
3336       OMP_NEST_LOCK_T_SIZE)) {
3337    lck = (kmp_user_lock_p)user_lock;
3338  }
3339#if KMP_USE_FUTEX
3340  else if ((__kmp_user_lock_kind == lk_futex) &&
3341           (sizeof(lck->futex.lk.poll) + sizeof(lck->futex.lk.depth_locked) <=
3342            OMP_NEST_LOCK_T_SIZE)) {
3343    lck = (kmp_user_lock_p)user_lock;
3344  }
3345#endif
3346  else {
3347    lck = __kmp_lookup_user_lock(user_lock, "omp_test_nest_lock");
3348  }
3349
3350#if USE_ITT_BUILD
3351  __kmp_itt_lock_acquiring(lck);
3352#endif /* USE_ITT_BUILD */
3353
3354#if OMPT_SUPPORT && OMPT_OPTIONAL
3355  // This is the case, if called from omp_init_lock_with_hint:
3356  void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
3357  if (!codeptr)
3358    codeptr = OMPT_GET_RETURN_ADDRESS(0);
  if (ompt_enabled.enabled &&
      ompt_enabled.ompt_callback_mutex_acquire) {
    ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
        ompt_mutex_test_nest_lock, omp_lock_hint_none,
        __ompt_get_mutex_impl_type(), (ompt_wait_id_t)(uintptr_t)lck, codeptr);
  }
3366#endif
3367
3368  rc = TEST_NESTED_LOCK(lck, gtid);
3369#if USE_ITT_BUILD
3370  if (rc) {
3371    __kmp_itt_lock_acquired(lck);
3372  } else {
3373    __kmp_itt_lock_cancelled(lck);
3374  }
3375#endif /* USE_ITT_BUILD */
3376#if OMPT_SUPPORT && OMPT_OPTIONAL
3377  if (ompt_enabled.enabled && rc) {
3378    if (rc == 1) {
3379      if (ompt_enabled.ompt_callback_mutex_acquired) {
3380        // lock_first
3381        ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
3382            ompt_mutex_test_nest_lock, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
3383      }
3384    } else {
3385      if (ompt_enabled.ompt_callback_nest_lock) {
3386        // lock_next
3387        ompt_callbacks.ompt_callback(ompt_callback_nest_lock)(
            ompt_scope_begin, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
3389      }
3390    }
3391  }
3392#endif
3393  return rc;
3394
3395  /* Can't use serial interval since not block structured */
3396
3397#endif // KMP_USE_DYNAMIC_LOCK
3398}
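
// For the nestable variant, omp_test_nest_lock() returns the new nesting
// count on success (1 for the first acquisition, larger values when the
// owner re-acquires) and 0 on failure, which is why the OMPT code above
// distinguishes rc == 1 (mutex_acquired) from rc > 1 (nest_lock scope_begin).
// A minimal user-level sketch (not part of this runtime):
//
//   #include <assert.h>
//   #include <omp.h>
//
//   int main(void) {
//     omp_nest_lock_t l;
//     omp_init_nest_lock(&l);
//     assert(omp_test_nest_lock(&l) == 1); // first acquisition
//     assert(omp_test_nest_lock(&l) == 2); // owner re-acquires; count grows
//     omp_unset_nest_lock(&l);             // count 2 -> 1, still held
//     omp_unset_nest_lock(&l);             // count 1 -> 0, lock released
//     omp_destroy_nest_lock(&l);
//     return 0;
//   }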
3399
3400// Interface to fast scalable reduce methods routines
3401
3402// keep the selected method in a thread local structure for cross-function
3403// usage: will be used in __kmpc_end_reduce* functions;
3404// another solution: to re-determine the method one more time in
3405// __kmpc_end_reduce* functions (new prototype required then)
3406// AT: which solution is better?
3407#define __KMP_SET_REDUCTION_METHOD(gtid, rmethod)                              \
3408  ((__kmp_threads[(gtid)]->th.th_local.packed_reduction_method) = (rmethod))
3409
3410#define __KMP_GET_REDUCTION_METHOD(gtid)                                       \
3411  (__kmp_threads[(gtid)]->th.th_local.packed_reduction_method)
3412
3413// description of the packed_reduction_method variable: look at the macros in
3414// kmp.h
3415
3416// used in a critical section reduce block
3417static __forceinline void
3418__kmp_enter_critical_section_reduce_block(ident_t *loc, kmp_int32 global_tid,
3419                                          kmp_critical_name *crit) {
3420
  // This lock was visible to customers and to the threading profile tool as a
  // serial overhead span (although it is used for an internal purpose only).
  // Open questions: why was it visible in the previous implementation, and
  // should we keep it visible in the new reduce block?
3425  kmp_user_lock_p lck;
3426
3427#if KMP_USE_DYNAMIC_LOCK
3428
3429  kmp_dyna_lock_t *lk = (kmp_dyna_lock_t *)crit;
3430  // Check if it is initialized.
3431  if (*lk == 0) {
3432    if (KMP_IS_D_LOCK(__kmp_user_lock_seq)) {
3433      KMP_COMPARE_AND_STORE_ACQ32((volatile kmp_int32 *)crit, 0,
3434                                  KMP_GET_D_TAG(__kmp_user_lock_seq));
3435    } else {
3436      __kmp_init_indirect_csptr(crit, loc, global_tid,
3437                                KMP_GET_I_TAG(__kmp_user_lock_seq));
3438    }
3439  }
3440  // Branch for accessing the actual lock object and set operation. This
3441  // branching is inevitable since this lock initialization does not follow the
3442  // normal dispatch path (lock table is not used).
3443  if (KMP_EXTRACT_D_TAG(lk) != 0) {
3444    lck = (kmp_user_lock_p)lk;
3445    KMP_DEBUG_ASSERT(lck != NULL);
3446    if (__kmp_env_consistency_check) {
3447      __kmp_push_sync(global_tid, ct_critical, loc, lck, __kmp_user_lock_seq);
3448    }
3449    KMP_D_LOCK_FUNC(lk, set)(lk, global_tid);
3450  } else {
3451    kmp_indirect_lock_t *ilk = *((kmp_indirect_lock_t **)lk);
3452    lck = ilk->lock;
3453    KMP_DEBUG_ASSERT(lck != NULL);
3454    if (__kmp_env_consistency_check) {
3455      __kmp_push_sync(global_tid, ct_critical, loc, lck, __kmp_user_lock_seq);
3456    }
3457    KMP_I_LOCK_FUNC(ilk, set)(lck, global_tid);
3458  }
3459
3460#else // KMP_USE_DYNAMIC_LOCK
3461
3462  // We know that the fast reduction code is only emitted by Intel compilers
3463  // with 32 byte critical sections. If there isn't enough space, then we
3464  // have to use a pointer.
3465  if (__kmp_base_user_lock_size <= INTEL_CRITICAL_SIZE) {
3466    lck = (kmp_user_lock_p)crit;
3467  } else {
3468    lck = __kmp_get_critical_section_ptr(crit, loc, global_tid);
3469  }
3470  KMP_DEBUG_ASSERT(lck != NULL);
3471
3472  if (__kmp_env_consistency_check)
3473    __kmp_push_sync(global_tid, ct_critical, loc, lck);
3474
3475  __kmp_acquire_user_lock_with_checks(lck, global_tid);
3476
3477#endif // KMP_USE_DYNAMIC_LOCK
3478}
3479
3480// used in a critical section reduce block
3481static __forceinline void
3482__kmp_end_critical_section_reduce_block(ident_t *loc, kmp_int32 global_tid,
3483                                        kmp_critical_name *crit) {
3484
3485  kmp_user_lock_p lck;
3486
3487#if KMP_USE_DYNAMIC_LOCK
3488
3489  if (KMP_IS_D_LOCK(__kmp_user_lock_seq)) {
3490    lck = (kmp_user_lock_p)crit;
3491    if (__kmp_env_consistency_check)
3492      __kmp_pop_sync(global_tid, ct_critical, loc);
3493    KMP_D_LOCK_FUNC(lck, unset)((kmp_dyna_lock_t *)lck, global_tid);
3494  } else {
3495    kmp_indirect_lock_t *ilk =
3496        (kmp_indirect_lock_t *)TCR_PTR(*((kmp_indirect_lock_t **)crit));
3497    if (__kmp_env_consistency_check)
3498      __kmp_pop_sync(global_tid, ct_critical, loc);
3499    KMP_I_LOCK_FUNC(ilk, unset)(ilk->lock, global_tid);
3500  }
3501
3502#else // KMP_USE_DYNAMIC_LOCK
3503
3504  // We know that the fast reduction code is only emitted by Intel compilers
3505  // with 32 byte critical sections. If there isn't enough space, then we have
3506  // to use a pointer.
3507  if (__kmp_base_user_lock_size > 32) {
3508    lck = *((kmp_user_lock_p *)crit);
3509    KMP_ASSERT(lck != NULL);
3510  } else {
3511    lck = (kmp_user_lock_p)crit;
3512  }
3513
3514  if (__kmp_env_consistency_check)
3515    __kmp_pop_sync(global_tid, ct_critical, loc);
3516
3517  __kmp_release_user_lock_with_checks(lck, global_tid);
3518
3519#endif // KMP_USE_DYNAMIC_LOCK
3520} // __kmp_end_critical_section_reduce_block
3521
3522static __forceinline int
3523__kmp_swap_teams_for_teams_reduction(kmp_info_t *th, kmp_team_t **team_p,
3524                                     int *task_state) {
3525  kmp_team_t *team;
3526
3527  // Check if we are inside the teams construct?
3528  if (th->th.th_teams_microtask) {
3529    *team_p = team = th->th.th_team;
3530    if (team->t.t_level == th->th.th_teams_level) {
3531      // This is reduction at teams construct.
3532      KMP_DEBUG_ASSERT(!th->th.th_info.ds.ds_tid); // AC: check that tid == 0
3533      // Let's swap teams temporarily for the reduction.
3534      th->th.th_info.ds.ds_tid = team->t.t_master_tid;
3535      th->th.th_team = team->t.t_parent;
3536      th->th.th_team_nproc = th->th.th_team->t.t_nproc;
3537      th->th.th_task_team = th->th.th_team->t.t_task_team[0];
3538      *task_state = th->th.th_task_state;
3539      th->th.th_task_state = 0;
3540
3541      return 1;
3542    }
3543  }
3544  return 0;
3545}
3546
3547static __forceinline void
3548__kmp_restore_swapped_teams(kmp_info_t *th, kmp_team_t *team, int task_state) {
3549  // Restore thread structure swapped in __kmp_swap_teams_for_teams_reduction.
3550  th->th.th_info.ds.ds_tid = 0;
3551  th->th.th_team = team;
3552  th->th.th_team_nproc = team->t.t_nproc;
3553  th->th.th_task_team = team->t.t_task_team[task_state];
3554  __kmp_type_convert(task_state, &(th->th.th_task_state));
3555}
3556
3557/* 2.a.i. Reduce Block without a terminating barrier */
3558/*!
3559@ingroup SYNCHRONIZATION
3560@param loc source location information
3561@param global_tid global thread number
3562@param num_vars number of items (variables) to be reduced
3563@param reduce_size size of data in bytes to be reduced
3564@param reduce_data pointer to data to be reduced
3565@param reduce_func callback function providing reduction operation on two
3566operands and returning result of reduction in lhs_data
3567@param lck pointer to the unique lock data structure
3568@result 1 for the primary thread, 0 for all other team threads, 2 for all team
3569threads if atomic reduction needed
3570
3571The nowait version is used for a reduce clause with the nowait argument.
3572*/
3573kmp_int32
3574__kmpc_reduce_nowait(ident_t *loc, kmp_int32 global_tid, kmp_int32 num_vars,
3575                     size_t reduce_size, void *reduce_data,
3576                     void (*reduce_func)(void *lhs_data, void *rhs_data),
3577                     kmp_critical_name *lck) {
3578
3579  KMP_COUNT_BLOCK(REDUCE_nowait);
3580  int retval = 0;
3581  PACKED_REDUCTION_METHOD_T packed_reduction_method;
3582  kmp_info_t *th;
3583  kmp_team_t *team;
3584  int teams_swapped = 0, task_state;
3585  KA_TRACE(10, ("__kmpc_reduce_nowait() enter: called T#%d\n", global_tid));
3586  __kmp_assert_valid_gtid(global_tid);
3587
3588  // why do we need this initialization here at all?
  // A reduction clause cannot be used as a stand-alone directive.
3590
3591  // do not call __kmp_serial_initialize(), it will be called by
3592  // __kmp_parallel_initialize() if needed
3593  // possible detection of false-positive race by the threadchecker ???
3594  if (!TCR_4(__kmp_init_parallel))
3595    __kmp_parallel_initialize();
3596
3597  __kmp_resume_if_soft_paused();
3598
3599// check correctness of reduce block nesting
3600#if KMP_USE_DYNAMIC_LOCK
3601  if (__kmp_env_consistency_check)
3602    __kmp_push_sync(global_tid, ct_reduce, loc, NULL, 0);
3603#else
3604  if (__kmp_env_consistency_check)
3605    __kmp_push_sync(global_tid, ct_reduce, loc, NULL);
3606#endif
3607
3608  th = __kmp_thread_from_gtid(global_tid);
3609  teams_swapped = __kmp_swap_teams_for_teams_reduction(th, &team, &task_state);
3610
  // The packed_reduction_method value will be reused by the __kmp_end_reduce*
  // function, so the value should be kept in a variable.
  // The variable should be either a construct-specific or a thread-specific
  // property, not a team-specific property
  //     (a thread can reach the next reduce block on the next construct, and
  //     the reduce method may differ on that construct).
  // An ident_t "loc" parameter could be used as a construct-specific property
  // (but what if loc == 0?)
  //     (if both construct-specific and team-specific variables were shared,
  //     then unnecessary extra syncs would be needed).
  // A thread-specific variable is better with regard to the two issues above
  // (next construct and extra syncs).
  // A thread-specific "th_local.reduction_method" variable is used currently.
  // Each thread executes the 'determine' and 'set' lines (having just one
  // thread do it would require unnecessary extra syncs).
3626
3627  packed_reduction_method = __kmp_determine_reduction_method(
3628      loc, global_tid, num_vars, reduce_size, reduce_data, reduce_func, lck);
3629  __KMP_SET_REDUCTION_METHOD(global_tid, packed_reduction_method);
3630
3631  OMPT_REDUCTION_DECL(th, global_tid);
3632  if (packed_reduction_method == critical_reduce_block) {
3633
3634    OMPT_REDUCTION_BEGIN;
3635
3636    __kmp_enter_critical_section_reduce_block(loc, global_tid, lck);
3637    retval = 1;
3638
3639  } else if (packed_reduction_method == empty_reduce_block) {
3640
3641    OMPT_REDUCTION_BEGIN;
3642
3643    // usage: if team size == 1, no synchronization is required ( Intel
3644    // platforms only )
3645    retval = 1;
3646
3647  } else if (packed_reduction_method == atomic_reduce_block) {
3648
3649    retval = 2;
3650
3651    // all threads should do this pop here (because __kmpc_end_reduce_nowait()
3652    // won't be called by the code gen)
3653    //     (it's not quite good, because the checking block has been closed by
3654    //     this 'pop',
3655    //      but atomic operation has not been executed yet, will be executed
3656    //      slightly later, literally on next instruction)
3657    if (__kmp_env_consistency_check)
3658      __kmp_pop_sync(global_tid, ct_reduce, loc);
3659
3660  } else if (TEST_REDUCTION_METHOD(packed_reduction_method,
3661                                   tree_reduce_block)) {
3662
// AT: performance issue: a real barrier here
// AT: (if the primary thread is slow, other threads are blocked here waiting
//      for the primary thread to come and release them)
// AT: (this is not what a customer might expect when specifying the NOWAIT
//      clause)
// AT: (specifying NOWAIT won't improve performance; it will just be confusing
//      to a customer)
// AT: another implementation of *barrier_gather*nowait() (or some other
//      design) might go faster and be more in line with the sense of NOWAIT
// AT: TO DO: run the EPCC benchmark and compare times
3672
3673// this barrier should be invisible to a customer and to the threading profile
3674// tool (it's neither a terminating barrier nor customer's code, it's
3675// used for an internal purpose)
3676#if OMPT_SUPPORT
    // JP: can this barrier potentially lead to task scheduling?
    // JP: as long as there is a barrier in the implementation, OMPT should and
    //     will provide the barrier events, so we set up the necessary
    //     frame/return addresses.
3681    ompt_frame_t *ompt_frame;
3682    if (ompt_enabled.enabled) {
3683      __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
3684      if (ompt_frame->enter_frame.ptr == NULL)
3685        ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
3686    }
3687    OMPT_STORE_RETURN_ADDRESS(global_tid);
3688#endif
3689#if USE_ITT_NOTIFY
3690    __kmp_threads[global_tid]->th.th_ident = loc;
3691#endif
3692    retval =
3693        __kmp_barrier(UNPACK_REDUCTION_BARRIER(packed_reduction_method),
3694                      global_tid, FALSE, reduce_size, reduce_data, reduce_func);
3695    retval = (retval != 0) ? (0) : (1);
3696#if OMPT_SUPPORT && OMPT_OPTIONAL
3697    if (ompt_enabled.enabled) {
3698      ompt_frame->enter_frame = ompt_data_none;
3699    }
3700#endif
3701
    // all other workers except the primary thread should do this pop here
    //     (none of the other workers will get to __kmpc_end_reduce_nowait())
3704    if (__kmp_env_consistency_check) {
3705      if (retval == 0) {
3706        __kmp_pop_sync(global_tid, ct_reduce, loc);
3707      }
3708    }
3709
3710  } else {
3711
3712    // should never reach this block
3713    KMP_ASSERT(0); // "unexpected method"
3714  }
3715  if (teams_swapped) {
3716    __kmp_restore_swapped_teams(th, team, task_state);
3717  }
3718  KA_TRACE(
3719      10,
3720      ("__kmpc_reduce_nowait() exit: called T#%d: method %08x, returns %08x\n",
3721       global_tid, packed_reduction_method, retval));
3722
3723  return retval;
3724}
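
// A rough sketch of how a front end typically consumes the return-value
// contract above for "#pragma omp for reduction(+ : sum) nowait"; the exact
// code generation is compiler-specific and the names loc, gtid, crit,
// reduce_func, sum and sum_priv are illustrative assumptions:
//
//   switch (__kmpc_reduce_nowait(loc, gtid, 1, sizeof(double), &sum_priv,
//                                reduce_func, &crit)) {
//   case 1: // critical/empty methods: every thread; tree method: primary only
//     sum += sum_priv; // combine the private copy into the shared variable
//     __kmpc_end_reduce_nowait(loc, gtid, &crit);
//     break;
//   case 2: // atomic_reduce_block: update atomically and, as the comments in
//           // __kmpc_end_reduce_nowait() note, never call the end function
//     #pragma omp atomic
//     sum += sum_priv;
//     break;
//   default: // 0: worker released by the tree barrier; nothing left to do
//     break;
//   }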
3725
3726/*!
3727@ingroup SYNCHRONIZATION
3728@param loc source location information
3729@param global_tid global thread id.
3730@param lck pointer to the unique lock data structure
3731
3732Finish the execution of a reduce nowait.
3733*/
3734void __kmpc_end_reduce_nowait(ident_t *loc, kmp_int32 global_tid,
3735                              kmp_critical_name *lck) {
3736
3737  PACKED_REDUCTION_METHOD_T packed_reduction_method;
3738
3739  KA_TRACE(10, ("__kmpc_end_reduce_nowait() enter: called T#%d\n", global_tid));
3740  __kmp_assert_valid_gtid(global_tid);
3741
3742  packed_reduction_method = __KMP_GET_REDUCTION_METHOD(global_tid);
3743
3744  OMPT_REDUCTION_DECL(__kmp_thread_from_gtid(global_tid), global_tid);
3745
3746  if (packed_reduction_method == critical_reduce_block) {
3747
3748    __kmp_end_critical_section_reduce_block(loc, global_tid, lck);
3749    OMPT_REDUCTION_END;
3750
3751  } else if (packed_reduction_method == empty_reduce_block) {
3752
3753    // usage: if team size == 1, no synchronization is required ( on Intel
3754    // platforms only )
3755
3756    OMPT_REDUCTION_END;
3757
3758  } else if (packed_reduction_method == atomic_reduce_block) {
3759
    // neither the primary thread nor the other workers should get here
    //     (code gen does not generate this call in case 2: atomic reduce block)
    // actually it would be better to remove this else-if entirely;
    // after removal this value would be caught by the 'else' branch and assert
3764
3765  } else if (TEST_REDUCTION_METHOD(packed_reduction_method,
3766                                   tree_reduce_block)) {
3767
3768    // only primary thread gets here
3769    // OMPT: tree reduction is annotated in the barrier code
3770
3771  } else {
3772
3773    // should never reach this block
3774    KMP_ASSERT(0); // "unexpected method"
3775  }
3776
3777  if (__kmp_env_consistency_check)
3778    __kmp_pop_sync(global_tid, ct_reduce, loc);
3779
3780  KA_TRACE(10, ("__kmpc_end_reduce_nowait() exit: called T#%d: method %08x\n",
3781                global_tid, packed_reduction_method));
3782
3783  return;
3784}
3785
3786/* 2.a.ii. Reduce Block with a terminating barrier */
3787
3788/*!
3789@ingroup SYNCHRONIZATION
3790@param loc source location information
3791@param global_tid global thread number
3792@param num_vars number of items (variables) to be reduced
3793@param reduce_size size of data in bytes to be reduced
3794@param reduce_data pointer to data to be reduced
3795@param reduce_func callback function providing reduction operation on two
3796operands and returning result of reduction in lhs_data
3797@param lck pointer to the unique lock data structure
3798@result 1 for the primary thread, 0 for all other team threads, 2 for all team
3799threads if atomic reduction needed
3800
3801A blocking reduce that includes an implicit barrier.
3802*/
3803kmp_int32 __kmpc_reduce(ident_t *loc, kmp_int32 global_tid, kmp_int32 num_vars,
3804                        size_t reduce_size, void *reduce_data,
3805                        void (*reduce_func)(void *lhs_data, void *rhs_data),
3806                        kmp_critical_name *lck) {
3807  KMP_COUNT_BLOCK(REDUCE_wait);
3808  int retval = 0;
3809  PACKED_REDUCTION_METHOD_T packed_reduction_method;
3810  kmp_info_t *th;
3811  kmp_team_t *team;
3812  int teams_swapped = 0, task_state;
3813
3814  KA_TRACE(10, ("__kmpc_reduce() enter: called T#%d\n", global_tid));
3815  __kmp_assert_valid_gtid(global_tid);
3816
3817  // why do we need this initialization here at all?
  // A reduction clause cannot be a stand-alone directive.
3819
3820  // do not call __kmp_serial_initialize(), it will be called by
3821  // __kmp_parallel_initialize() if needed
3822  // possible detection of false-positive race by the threadchecker ???
3823  if (!TCR_4(__kmp_init_parallel))
3824    __kmp_parallel_initialize();
3825
3826  __kmp_resume_if_soft_paused();
3827
3828// check correctness of reduce block nesting
3829#if KMP_USE_DYNAMIC_LOCK
3830  if (__kmp_env_consistency_check)
3831    __kmp_push_sync(global_tid, ct_reduce, loc, NULL, 0);
3832#else
3833  if (__kmp_env_consistency_check)
3834    __kmp_push_sync(global_tid, ct_reduce, loc, NULL);
3835#endif
3836
3837  th = __kmp_thread_from_gtid(global_tid);
3838  teams_swapped = __kmp_swap_teams_for_teams_reduction(th, &team, &task_state);
3839
3840  packed_reduction_method = __kmp_determine_reduction_method(
3841      loc, global_tid, num_vars, reduce_size, reduce_data, reduce_func, lck);
3842  __KMP_SET_REDUCTION_METHOD(global_tid, packed_reduction_method);
3843
3844  OMPT_REDUCTION_DECL(th, global_tid);
3845
3846  if (packed_reduction_method == critical_reduce_block) {
3847
3848    OMPT_REDUCTION_BEGIN;
3849    __kmp_enter_critical_section_reduce_block(loc, global_tid, lck);
3850    retval = 1;
3851
3852  } else if (packed_reduction_method == empty_reduce_block) {
3853
3854    OMPT_REDUCTION_BEGIN;
3855    // usage: if team size == 1, no synchronization is required ( Intel
3856    // platforms only )
3857    retval = 1;
3858
3859  } else if (packed_reduction_method == atomic_reduce_block) {
3860
3861    retval = 2;
3862
3863  } else if (TEST_REDUCTION_METHOD(packed_reduction_method,
3864                                   tree_reduce_block)) {
3865
3866// case tree_reduce_block:
3867// this barrier should be visible to a customer and to the threading profile
3868// tool (it's a terminating barrier on constructs if NOWAIT not specified)
3869#if OMPT_SUPPORT
3870    ompt_frame_t *ompt_frame;
3871    if (ompt_enabled.enabled) {
3872      __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
3873      if (ompt_frame->enter_frame.ptr == NULL)
3874        ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
3875    }
3876    OMPT_STORE_RETURN_ADDRESS(global_tid);
3877#endif
3878#if USE_ITT_NOTIFY
3879    __kmp_threads[global_tid]->th.th_ident =
3880        loc; // needed for correct notification of frames
3881#endif
3882    retval =
3883        __kmp_barrier(UNPACK_REDUCTION_BARRIER(packed_reduction_method),
3884                      global_tid, TRUE, reduce_size, reduce_data, reduce_func);
3885    retval = (retval != 0) ? (0) : (1);
3886#if OMPT_SUPPORT && OMPT_OPTIONAL
3887    if (ompt_enabled.enabled) {
3888      ompt_frame->enter_frame = ompt_data_none;
3889    }
3890#endif
3891
    // all other workers except the primary thread should do this pop here
    //     (no worker other than the primary will enter __kmpc_end_reduce())
3894    if (__kmp_env_consistency_check) {
3895      if (retval == 0) { // 0: all other workers; 1: primary thread
3896        __kmp_pop_sync(global_tid, ct_reduce, loc);
3897      }
3898    }
3899
3900  } else {
3901
3902    // should never reach this block
3903    KMP_ASSERT(0); // "unexpected method"
3904  }
3905  if (teams_swapped) {
3906    __kmp_restore_swapped_teams(th, team, task_state);
3907  }
3908
3909  KA_TRACE(10,
3910           ("__kmpc_reduce() exit: called T#%d: method %08x, returns %08x\n",
3911            global_tid, packed_reduction_method, retval));
3912  return retval;
3913}
3914
3915/*!
3916@ingroup SYNCHRONIZATION
3917@param loc source location information
3918@param global_tid global thread id.
3919@param lck pointer to the unique lock data structure
3920
3921Finish the execution of a blocking reduce.
3922The <tt>lck</tt> pointer must be the same as that used in the corresponding
3923start function.
3924*/
3925void __kmpc_end_reduce(ident_t *loc, kmp_int32 global_tid,
3926                       kmp_critical_name *lck) {
3927
3928  PACKED_REDUCTION_METHOD_T packed_reduction_method;
3929  kmp_info_t *th;
3930  kmp_team_t *team;
3931  int teams_swapped = 0, task_state;
3932
3933  KA_TRACE(10, ("__kmpc_end_reduce() enter: called T#%d\n", global_tid));
3934  __kmp_assert_valid_gtid(global_tid);
3935
3936  th = __kmp_thread_from_gtid(global_tid);
3937  teams_swapped = __kmp_swap_teams_for_teams_reduction(th, &team, &task_state);
3938
3939  packed_reduction_method = __KMP_GET_REDUCTION_METHOD(global_tid);
3940
3941  // this barrier should be visible to a customer and to the threading profile
3942  // tool (it's a terminating barrier on constructs if NOWAIT not specified)
3943  OMPT_REDUCTION_DECL(th, global_tid);
3944
3945  if (packed_reduction_method == critical_reduce_block) {
3946    __kmp_end_critical_section_reduce_block(loc, global_tid, lck);
3947
3948    OMPT_REDUCTION_END;
3949
3950// TODO: implicit barrier: should be exposed
3951#if OMPT_SUPPORT
3952    ompt_frame_t *ompt_frame;
3953    if (ompt_enabled.enabled) {
3954      __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
3955      if (ompt_frame->enter_frame.ptr == NULL)
3956        ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
3957    }
3958    OMPT_STORE_RETURN_ADDRESS(global_tid);
3959#endif
3960#if USE_ITT_NOTIFY
3961    __kmp_threads[global_tid]->th.th_ident = loc;
3962#endif
3963    __kmp_barrier(bs_plain_barrier, global_tid, FALSE, 0, NULL, NULL);
3964#if OMPT_SUPPORT && OMPT_OPTIONAL
3965    if (ompt_enabled.enabled) {
3966      ompt_frame->enter_frame = ompt_data_none;
3967    }
3968#endif
3969
3970  } else if (packed_reduction_method == empty_reduce_block) {
3971
3972    OMPT_REDUCTION_END;
3973
3974// usage: if team size==1, no synchronization is required (Intel platforms only)
3975
3976// TODO: implicit barrier: should be exposed
3977#if OMPT_SUPPORT
3978    ompt_frame_t *ompt_frame;
3979    if (ompt_enabled.enabled) {
3980      __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
3981      if (ompt_frame->enter_frame.ptr == NULL)
3982        ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
3983    }
3984    OMPT_STORE_RETURN_ADDRESS(global_tid);
3985#endif
3986#if USE_ITT_NOTIFY
3987    __kmp_threads[global_tid]->th.th_ident = loc;
3988#endif
3989    __kmp_barrier(bs_plain_barrier, global_tid, FALSE, 0, NULL, NULL);
3990#if OMPT_SUPPORT && OMPT_OPTIONAL
3991    if (ompt_enabled.enabled) {
3992      ompt_frame->enter_frame = ompt_data_none;
3993    }
3994#endif
3995
3996  } else if (packed_reduction_method == atomic_reduce_block) {
3997
3998#if OMPT_SUPPORT
3999    ompt_frame_t *ompt_frame;
4000    if (ompt_enabled.enabled) {
4001      __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
4002      if (ompt_frame->enter_frame.ptr == NULL)
4003        ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
4004    }
4005    OMPT_STORE_RETURN_ADDRESS(global_tid);
4006#endif
4007// TODO: implicit barrier: should be exposed
4008#if USE_ITT_NOTIFY
4009    __kmp_threads[global_tid]->th.th_ident = loc;
4010#endif
4011    __kmp_barrier(bs_plain_barrier, global_tid, FALSE, 0, NULL, NULL);
4012#if OMPT_SUPPORT && OMPT_OPTIONAL
4013    if (ompt_enabled.enabled) {
4014      ompt_frame->enter_frame = ompt_data_none;
4015    }
4016#endif
4017
4018  } else if (TEST_REDUCTION_METHOD(packed_reduction_method,
4019                                   tree_reduce_block)) {
4020
4021    // only primary thread executes here (primary releases all other workers)
4022    __kmp_end_split_barrier(UNPACK_REDUCTION_BARRIER(packed_reduction_method),
4023                            global_tid);
4024
4025  } else {
4026
4027    // should never reach this block
4028    KMP_ASSERT(0); // "unexpected method"
4029  }
4030  if (teams_swapped) {
4031    __kmp_restore_swapped_teams(th, team, task_state);
4032  }
4033
4034  if (__kmp_env_consistency_check)
4035    __kmp_pop_sync(global_tid, ct_reduce, loc);
4036
4037  KA_TRACE(10, ("__kmpc_end_reduce() exit: called T#%d: method %08x\n",
4038                global_tid, packed_reduction_method));
4039
4040  return;
4041}
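
// The blocking variant is typically driven the same way as the nowait sketch
// following __kmpc_reduce_nowait(), with one difference that matches the
// atomic_reduce_block branch above: __kmpc_end_reduce() is also called on the
// atomic path so the construct still finishes with a barrier. Again a hedged
// sketch with illustrative names, not the definitive lowering:
//
//   switch (__kmpc_reduce(loc, gtid, 1, sizeof(double), &sum_priv,
//                         reduce_func, &crit)) {
//   case 1:
//     sum += sum_priv;
//     __kmpc_end_reduce(loc, gtid, &crit); // releases lock, runs the barrier
//     break;
//   case 2:
//     #pragma omp atomic
//     sum += sum_priv;
//     __kmpc_end_reduce(loc, gtid, &crit); // reaches the plain barrier above
//     break;
//   default:
//     break; // 0: worker thread; nothing left to do here
//   }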
4042
4043#undef __KMP_GET_REDUCTION_METHOD
4044#undef __KMP_SET_REDUCTION_METHOD
4045
4046/* end of interface to fast scalable reduce routines */
4047
4048kmp_uint64 __kmpc_get_taskid() {
4049
4050  kmp_int32 gtid;
4051  kmp_info_t *thread;
4052
4053  gtid = __kmp_get_gtid();
4054  if (gtid < 0) {
4055    return 0;
4056  }
4057  thread = __kmp_thread_from_gtid(gtid);
4058  return thread->th.th_current_task->td_task_id;
4059
4060} // __kmpc_get_taskid
4061
4062kmp_uint64 __kmpc_get_parent_taskid() {
4063
4064  kmp_int32 gtid;
4065  kmp_info_t *thread;
4066  kmp_taskdata_t *parent_task;
4067
4068  gtid = __kmp_get_gtid();
4069  if (gtid < 0) {
4070    return 0;
4071  }
4072  thread = __kmp_thread_from_gtid(gtid);
4073  parent_task = thread->th.th_current_task->td_parent;
4074  return (parent_task == NULL ? 0 : parent_task->td_task_id);
4075
4076} // __kmpc_get_parent_taskid
4077
4078/*!
4079@ingroup WORK_SHARING
4080@param loc  source location information.
4081@param gtid  global thread number.
4082@param num_dims  number of associated doacross loops.
4083@param dims  info on loops bounds.
4084
4085Initialize doacross loop information.
Expect the compiler to send us inclusive bounds,
e.g. for (i = 2; i < 9; i += 2) gives lo=2, up=8, st=2.
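For that example the first dimension contributes (up - lo) / st + 1 =
(8 - 2) / 2 + 1 = 4 iterations to the total trip count computed below.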
4088*/
4089void __kmpc_doacross_init(ident_t *loc, int gtid, int num_dims,
4090                          const struct kmp_dim *dims) {
4091  __kmp_assert_valid_gtid(gtid);
4092  int j, idx;
4093  kmp_int64 last, trace_count;
4094  kmp_info_t *th = __kmp_threads[gtid];
4095  kmp_team_t *team = th->th.th_team;
4096  kmp_uint32 *flags;
4097  kmp_disp_t *pr_buf = th->th.th_dispatch;
4098  dispatch_shared_info_t *sh_buf;
4099
4100  KA_TRACE(
4101      20,
4102      ("__kmpc_doacross_init() enter: called T#%d, num dims %d, active %d\n",
4103       gtid, num_dims, !team->t.t_serialized));
4104  KMP_DEBUG_ASSERT(dims != NULL);
4105  KMP_DEBUG_ASSERT(num_dims > 0);
4106
4107  if (team->t.t_serialized) {
4108    KA_TRACE(20, ("__kmpc_doacross_init() exit: serialized team\n"));
4109    return; // no dependencies if team is serialized
4110  }
4111  KMP_DEBUG_ASSERT(team->t.t_nproc > 1);
4112  idx = pr_buf->th_doacross_buf_idx++; // Increment index of shared buffer for
4113  // the next loop
4114  sh_buf = &team->t.t_disp_buffer[idx % __kmp_dispatch_num_buffers];
4115
4116  // Save bounds info into allocated private buffer
4117  KMP_DEBUG_ASSERT(pr_buf->th_doacross_info == NULL);
4118  pr_buf->th_doacross_info = (kmp_int64 *)__kmp_thread_malloc(
4119      th, sizeof(kmp_int64) * (4 * num_dims + 1));
4120  KMP_DEBUG_ASSERT(pr_buf->th_doacross_info != NULL);
4121  pr_buf->th_doacross_info[0] =
4122      (kmp_int64)num_dims; // first element is number of dimensions
4123  // Save also address of num_done in order to access it later without knowing
4124  // the buffer index
4125  pr_buf->th_doacross_info[1] = (kmp_int64)&sh_buf->doacross_num_done;
4126  pr_buf->th_doacross_info[2] = dims[0].lo;
4127  pr_buf->th_doacross_info[3] = dims[0].up;
4128  pr_buf->th_doacross_info[4] = dims[0].st;
4129  last = 5;
4130  for (j = 1; j < num_dims; ++j) {
4131    kmp_int64
4132        range_length; // To keep ranges of all dimensions but the first dims[0]
4133    if (dims[j].st == 1) { // most common case
4134      // AC: should we care of ranges bigger than LLONG_MAX? (not for now)
4135      range_length = dims[j].up - dims[j].lo + 1;
4136    } else {
4137      if (dims[j].st > 0) {
4138        KMP_DEBUG_ASSERT(dims[j].up > dims[j].lo);
4139        range_length = (kmp_uint64)(dims[j].up - dims[j].lo) / dims[j].st + 1;
4140      } else { // negative increment
4141        KMP_DEBUG_ASSERT(dims[j].lo > dims[j].up);
4142        range_length =
4143            (kmp_uint64)(dims[j].lo - dims[j].up) / (-dims[j].st) + 1;
4144      }
4145    }
4146    pr_buf->th_doacross_info[last++] = range_length;
4147    pr_buf->th_doacross_info[last++] = dims[j].lo;
4148    pr_buf->th_doacross_info[last++] = dims[j].up;
4149    pr_buf->th_doacross_info[last++] = dims[j].st;
4150  }
4151
4152  // Compute total trip count.
4153  // Start with range of dims[0] which we don't need to keep in the buffer.
4154  if (dims[0].st == 1) { // most common case
4155    trace_count = dims[0].up - dims[0].lo + 1;
4156  } else if (dims[0].st > 0) {
4157    KMP_DEBUG_ASSERT(dims[0].up > dims[0].lo);
4158    trace_count = (kmp_uint64)(dims[0].up - dims[0].lo) / dims[0].st + 1;
4159  } else { // negative increment
4160    KMP_DEBUG_ASSERT(dims[0].lo > dims[0].up);
4161    trace_count = (kmp_uint64)(dims[0].lo - dims[0].up) / (-dims[0].st) + 1;
4162  }
4163  for (j = 1; j < num_dims; ++j) {
4164    trace_count *= pr_buf->th_doacross_info[4 * j + 1]; // use kept ranges
4165  }
4166  KMP_DEBUG_ASSERT(trace_count > 0);
4167
4168  // Check if shared buffer is not occupied by other loop (idx -
4169  // __kmp_dispatch_num_buffers)
4170  if (idx != sh_buf->doacross_buf_idx) {
4171    // Shared buffer is occupied, wait for it to be free
4172    __kmp_wait_4((volatile kmp_uint32 *)&sh_buf->doacross_buf_idx, idx,
4173                 __kmp_eq_4, NULL);
4174  }
4175#if KMP_32_BIT_ARCH
4176  // Check if we are the first thread. After the CAS the first thread gets 0,
4177  // others get 1 if initialization is in progress, allocated pointer otherwise.
4178  // Treat pointer as volatile integer (value 0 or 1) until memory is allocated.
4179  flags = (kmp_uint32 *)KMP_COMPARE_AND_STORE_RET32(
4180      (volatile kmp_int32 *)&sh_buf->doacross_flags, NULL, 1);
4181#else
4182  flags = (kmp_uint32 *)KMP_COMPARE_AND_STORE_RET64(
4183      (volatile kmp_int64 *)&sh_buf->doacross_flags, NULL, 1LL);
4184#endif
4185  if (flags == NULL) {
4186    // we are the first thread, allocate the array of flags
4187    size_t size =
4188        (size_t)trace_count / 8 + 8; // in bytes, use single bit per iteration
4189    flags = (kmp_uint32 *)__kmp_thread_calloc(th, size, 1);
4190    KMP_MB();
4191    sh_buf->doacross_flags = flags;
4192  } else if (flags == (kmp_uint32 *)1) {
4193#if KMP_32_BIT_ARCH
4194    // initialization is still in progress, need to wait
4195    while (*(volatile kmp_int32 *)&sh_buf->doacross_flags == 1)
4196#else
4197    while (*(volatile kmp_int64 *)&sh_buf->doacross_flags == 1LL)
4198#endif
4199      KMP_YIELD(TRUE);
4200    KMP_MB();
4201  } else {
4202    KMP_MB();
4203  }
4204  KMP_DEBUG_ASSERT(sh_buf->doacross_flags > (kmp_uint32 *)1); // check ptr value
4205  pr_buf->th_doacross_flags =
4206      sh_buf->doacross_flags; // save private copy in order to not
4207  // touch shared buffer on each iteration
4208  KA_TRACE(20, ("__kmpc_doacross_init() exit: T#%d\n", gtid));
4209}
4210
4211void __kmpc_doacross_wait(ident_t *loc, int gtid, const kmp_int64 *vec) {
4212  __kmp_assert_valid_gtid(gtid);
4213  kmp_int64 shft;
4214  size_t num_dims, i;
4215  kmp_uint32 flag;
4216  kmp_int64 iter_number; // iteration number of "collapsed" loop nest
4217  kmp_info_t *th = __kmp_threads[gtid];
4218  kmp_team_t *team = th->th.th_team;
4219  kmp_disp_t *pr_buf;
4220  kmp_int64 lo, up, st;
4221
4222  KA_TRACE(20, ("__kmpc_doacross_wait() enter: called T#%d\n", gtid));
4223  if (team->t.t_serialized) {
4224    KA_TRACE(20, ("__kmpc_doacross_wait() exit: serialized team\n"));
4225    return; // no dependencies if team is serialized
4226  }
4227
4228  // calculate sequential iteration number and check out-of-bounds condition
4229  pr_buf = th->th.th_dispatch;
4230  KMP_DEBUG_ASSERT(pr_buf->th_doacross_info != NULL);
4231  num_dims = (size_t)pr_buf->th_doacross_info[0];
4232  lo = pr_buf->th_doacross_info[2];
4233  up = pr_buf->th_doacross_info[3];
4234  st = pr_buf->th_doacross_info[4];
4235#if OMPT_SUPPORT && OMPT_OPTIONAL
4236  ompt_dependence_t deps[num_dims];
4237#endif
4238  if (st == 1) { // most common case
4239    if (vec[0] < lo || vec[0] > up) {
4240      KA_TRACE(20, ("__kmpc_doacross_wait() exit: T#%d iter %lld is out of "
4241                    "bounds [%lld,%lld]\n",
4242                    gtid, vec[0], lo, up));
4243      return;
4244    }
4245    iter_number = vec[0] - lo;
4246  } else if (st > 0) {
4247    if (vec[0] < lo || vec[0] > up) {
4248      KA_TRACE(20, ("__kmpc_doacross_wait() exit: T#%d iter %lld is out of "
4249                    "bounds [%lld,%lld]\n",
4250                    gtid, vec[0], lo, up));
4251      return;
4252    }
4253    iter_number = (kmp_uint64)(vec[0] - lo) / st;
4254  } else { // negative increment
4255    if (vec[0] > lo || vec[0] < up) {
4256      KA_TRACE(20, ("__kmpc_doacross_wait() exit: T#%d iter %lld is out of "
4257                    "bounds [%lld,%lld]\n",
4258                    gtid, vec[0], lo, up));
4259      return;
4260    }
4261    iter_number = (kmp_uint64)(lo - vec[0]) / (-st);
4262  }
4263#if OMPT_SUPPORT && OMPT_OPTIONAL
4264  deps[0].variable.value = iter_number;
4265  deps[0].dependence_type = ompt_dependence_type_sink;
4266#endif
4267  for (i = 1; i < num_dims; ++i) {
4268    kmp_int64 iter, ln;
4269    size_t j = i * 4;
4270    ln = pr_buf->th_doacross_info[j + 1];
4271    lo = pr_buf->th_doacross_info[j + 2];
4272    up = pr_buf->th_doacross_info[j + 3];
4273    st = pr_buf->th_doacross_info[j + 4];
4274    if (st == 1) {
4275      if (vec[i] < lo || vec[i] > up) {
4276        KA_TRACE(20, ("__kmpc_doacross_wait() exit: T#%d iter %lld is out of "
4277                      "bounds [%lld,%lld]\n",
4278                      gtid, vec[i], lo, up));
4279        return;
4280      }
4281      iter = vec[i] - lo;
4282    } else if (st > 0) {
4283      if (vec[i] < lo || vec[i] > up) {
4284        KA_TRACE(20, ("__kmpc_doacross_wait() exit: T#%d iter %lld is out of "
4285                      "bounds [%lld,%lld]\n",
4286                      gtid, vec[i], lo, up));
4287        return;
4288      }
4289      iter = (kmp_uint64)(vec[i] - lo) / st;
4290    } else { // st < 0
4291      if (vec[i] > lo || vec[i] < up) {
4292        KA_TRACE(20, ("__kmpc_doacross_wait() exit: T#%d iter %lld is out of "
4293                      "bounds [%lld,%lld]\n",
4294                      gtid, vec[i], lo, up));
4295        return;
4296      }
4297      iter = (kmp_uint64)(lo - vec[i]) / (-st);
4298    }
4299    iter_number = iter + ln * iter_number;
4300#if OMPT_SUPPORT && OMPT_OPTIONAL
4301    deps[i].variable.value = iter;
4302    deps[i].dependence_type = ompt_dependence_type_sink;
4303#endif
4304  }
4305  shft = iter_number % 32; // use 32-bit granularity
4306  iter_number >>= 5; // divided by 32
4307  flag = 1 << shft;
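  // e.g. linearized iteration 70 -> flag word 70 >> 5 = 2, bit 70 % 32 = 6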
4308  while ((flag & pr_buf->th_doacross_flags[iter_number]) == 0) {
4309    KMP_YIELD(TRUE);
4310  }
4311  KMP_MB();
4312#if OMPT_SUPPORT && OMPT_OPTIONAL
4313  if (ompt_enabled.ompt_callback_dependences) {
4314    ompt_callbacks.ompt_callback(ompt_callback_dependences)(
4315        &(OMPT_CUR_TASK_INFO(th)->task_data), deps, (kmp_uint32)num_dims);
4316  }
4317#endif
4318  KA_TRACE(20,
4319           ("__kmpc_doacross_wait() exit: T#%d wait for iter %lld completed\n",
4320            gtid, (iter_number << 5) + shft));
4321}
4322
4323void __kmpc_doacross_post(ident_t *loc, int gtid, const kmp_int64 *vec) {
4324  __kmp_assert_valid_gtid(gtid);
4325  kmp_int64 shft;
4326  size_t num_dims, i;
4327  kmp_uint32 flag;
4328  kmp_int64 iter_number; // iteration number of "collapsed" loop nest
4329  kmp_info_t *th = __kmp_threads[gtid];
4330  kmp_team_t *team = th->th.th_team;
4331  kmp_disp_t *pr_buf;
4332  kmp_int64 lo, st;
4333
4334  KA_TRACE(20, ("__kmpc_doacross_post() enter: called T#%d\n", gtid));
4335  if (team->t.t_serialized) {
4336    KA_TRACE(20, ("__kmpc_doacross_post() exit: serialized team\n"));
4337    return; // no dependencies if team is serialized
4338  }
4339
4340  // calculate sequential iteration number (same as in "wait" but no
4341  // out-of-bounds checks)
4342  pr_buf = th->th.th_dispatch;
4343  KMP_DEBUG_ASSERT(pr_buf->th_doacross_info != NULL);
4344  num_dims = (size_t)pr_buf->th_doacross_info[0];
4345  lo = pr_buf->th_doacross_info[2];
4346  st = pr_buf->th_doacross_info[4];
4347#if OMPT_SUPPORT && OMPT_OPTIONAL
4348  ompt_dependence_t deps[num_dims];
4349#endif
4350  if (st == 1) { // most common case
4351    iter_number = vec[0] - lo;
4352  } else if (st > 0) {
4353    iter_number = (kmp_uint64)(vec[0] - lo) / st;
4354  } else { // negative increment
4355    iter_number = (kmp_uint64)(lo - vec[0]) / (-st);
4356  }
4357#if OMPT_SUPPORT && OMPT_OPTIONAL
4358  deps[0].variable.value = iter_number;
4359  deps[0].dependence_type = ompt_dependence_type_source;
4360#endif
4361  for (i = 1; i < num_dims; ++i) {
4362    kmp_int64 iter, ln;
4363    size_t j = i * 4;
4364    ln = pr_buf->th_doacross_info[j + 1];
4365    lo = pr_buf->th_doacross_info[j + 2];
4366    st = pr_buf->th_doacross_info[j + 4];
4367    if (st == 1) {
4368      iter = vec[i] - lo;
4369    } else if (st > 0) {
4370      iter = (kmp_uint64)(vec[i] - lo) / st;
4371    } else { // st < 0
4372      iter = (kmp_uint64)(lo - vec[i]) / (-st);
4373    }
4374    iter_number = iter + ln * iter_number;
4375#if OMPT_SUPPORT && OMPT_OPTIONAL
4376    deps[i].variable.value = iter;
4377    deps[i].dependence_type = ompt_dependence_type_source;
4378#endif
4379  }
4380#if OMPT_SUPPORT && OMPT_OPTIONAL
4381  if (ompt_enabled.ompt_callback_dependences) {
4382    ompt_callbacks.ompt_callback(ompt_callback_dependences)(
4383        &(OMPT_CUR_TASK_INFO(th)->task_data), deps, (kmp_uint32)num_dims);
4384  }
4385#endif
4386  shft = iter_number % 32; // use 32-bit granularity
4387  iter_number >>= 5; // divided by 32
4388  flag = 1 << shft;
4389  KMP_MB();
4390  if ((flag & pr_buf->th_doacross_flags[iter_number]) == 0)
4391    KMP_TEST_THEN_OR32(&pr_buf->th_doacross_flags[iter_number], flag);
4392  KA_TRACE(20, ("__kmpc_doacross_post() exit: T#%d iter %lld posted\n", gtid,
4393                (iter_number << 5) + shft));
4394}
4395
4396void __kmpc_doacross_fini(ident_t *loc, int gtid) {
4397  __kmp_assert_valid_gtid(gtid);
4398  kmp_int32 num_done;
4399  kmp_info_t *th = __kmp_threads[gtid];
4400  kmp_team_t *team = th->th.th_team;
4401  kmp_disp_t *pr_buf = th->th.th_dispatch;
4402
4403  KA_TRACE(20, ("__kmpc_doacross_fini() enter: called T#%d\n", gtid));
4404  if (team->t.t_serialized) {
4405    KA_TRACE(20, ("__kmpc_doacross_fini() exit: serialized team %p\n", team));
4406    return; // nothing to do
4407  }
4408  num_done =
4409      KMP_TEST_THEN_INC32((kmp_uintptr_t)(pr_buf->th_doacross_info[1])) + 1;
4410  if (num_done == th->th.th_team_nproc) {
4411    // we are the last thread, need to free shared resources
4412    int idx = pr_buf->th_doacross_buf_idx - 1;
4413    dispatch_shared_info_t *sh_buf =
4414        &team->t.t_disp_buffer[idx % __kmp_dispatch_num_buffers];
4415    KMP_DEBUG_ASSERT(pr_buf->th_doacross_info[1] ==
4416                     (kmp_int64)&sh_buf->doacross_num_done);
4417    KMP_DEBUG_ASSERT(num_done == sh_buf->doacross_num_done);
4418    KMP_DEBUG_ASSERT(idx == sh_buf->doacross_buf_idx);
4419    __kmp_thread_free(th, CCAST(kmp_uint32 *, sh_buf->doacross_flags));
4420    sh_buf->doacross_flags = NULL;
4421    sh_buf->doacross_num_done = 0;
4422    sh_buf->doacross_buf_idx +=
4423        __kmp_dispatch_num_buffers; // free buffer for future re-use
4424  }
4425  // free private resources (need to keep buffer index forever)
4426  pr_buf->th_doacross_flags = NULL;
4427  __kmp_thread_free(th, (void *)pr_buf->th_doacross_info);
4428  pr_buf->th_doacross_info = NULL;
4429  KA_TRACE(20, ("__kmpc_doacross_fini() exit: T#%d\n", gtid));
4430}
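
// A hedged source-level sketch of the doacross protocol these four entry
// points implement, using OpenMP 4.5 ordered(depend) syntax; the mapping in
// the comments is the typical front-end lowering (an assumption, not dictated
// by this file) and the names a, use and produce are illustrative:
//
//   #pragma omp for ordered(1)                // -> __kmpc_doacross_init()
//   for (int i = 2; i < 9; i += 2) {
//   #pragma omp ordered depend(sink : i - 2)  // -> __kmpc_doacross_wait(),
//     use(a[i - 2]);                          //    blocks until i-2 posted
//     a[i] = produce(i);
//   #pragma omp ordered depend(source)        // -> __kmpc_doacross_post()
//   } // loop end -> __kmpc_doacross_fini()
//
// The vec arguments to wait/post carry the iteration numbers named in the
// depend clauses, one element per dimension declared in ordered(n).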
4431
4432/* OpenMP 5.1 Memory Management routines */
4433void *omp_alloc(size_t size, omp_allocator_handle_t allocator) {
4434  return __kmp_alloc(__kmp_entry_gtid(), 0, size, allocator);
4435}
4436
4437void *omp_aligned_alloc(size_t align, size_t size,
4438                        omp_allocator_handle_t allocator) {
4439  return __kmp_alloc(__kmp_entry_gtid(), align, size, allocator);
4440}
4441
4442void *omp_calloc(size_t nmemb, size_t size, omp_allocator_handle_t allocator) {
4443  return __kmp_calloc(__kmp_entry_gtid(), 0, nmemb, size, allocator);
4444}
4445
4446void *omp_aligned_calloc(size_t align, size_t nmemb, size_t size,
4447                         omp_allocator_handle_t allocator) {
4448  return __kmp_calloc(__kmp_entry_gtid(), align, nmemb, size, allocator);
4449}
4450
4451void *omp_realloc(void *ptr, size_t size, omp_allocator_handle_t allocator,
4452                  omp_allocator_handle_t free_allocator) {
4453  return __kmp_realloc(__kmp_entry_gtid(), ptr, size, allocator,
4454                       free_allocator);
4455}
4456
4457void omp_free(void *ptr, omp_allocator_handle_t allocator) {
4458  ___kmpc_free(__kmp_entry_gtid(), ptr, allocator);
4459}
4460/* end of OpenMP 5.1 Memory Management routines */
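
// A minimal user-level sketch of these OpenMP 5.x allocator routines with a
// predefined allocator handle (illustrative only, not part of this runtime):
//
//   #include <omp.h>
//
//   void fill(size_t n) {
//     double *buf = (double *)omp_aligned_alloc(64, n * sizeof(double),
//                                               omp_default_mem_alloc);
//     if (buf) {
//       for (size_t i = 0; i < n; ++i)
//         buf[i] = 0.0;
//       omp_free(buf, omp_default_mem_alloc);
//     }
//   }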
4461
4462int __kmpc_get_target_offload(void) {
4463  if (!__kmp_init_serial) {
4464    __kmp_serial_initialize();
4465  }
4466  return __kmp_target_offload;
4467}
4468
4469int __kmpc_pause_resource(kmp_pause_status_t level) {
4470  if (!__kmp_init_serial) {
4471    return 1; // Can't pause if runtime is not initialized
4472  }
4473  return __kmp_pause_resource(level);
4474}
4475
4476void __kmpc_error(ident_t *loc, int severity, const char *message) {
4477  if (!__kmp_init_serial)
4478    __kmp_serial_initialize();
4479
4480  KMP_ASSERT(severity == severity_warning || severity == severity_fatal);
4481
4482#if OMPT_SUPPORT
4483  if (ompt_enabled.enabled && ompt_enabled.ompt_callback_error) {
4484    ompt_callbacks.ompt_callback(ompt_callback_error)(
4485        (ompt_severity_t)severity, message, KMP_STRLEN(message),
4486        OMPT_GET_RETURN_ADDRESS(0));
4487  }
4488#endif // OMPT_SUPPORT
4489
4490  char *src_loc;
4491  if (loc && loc->psource) {
4492    kmp_str_loc_t str_loc = __kmp_str_loc_init(loc->psource, false);
4493    src_loc =
4494        __kmp_str_format("%s:%d:%d", str_loc.file, str_loc.line, str_loc.col);
4495    __kmp_str_loc_free(&str_loc);
4496  } else {
4497    src_loc = __kmp_str_format("unknown");
4498  }
4499
4500  if (severity == severity_warning)
4501    KMP_WARNING(UserDirectedWarning, src_loc, message);
4502  else
4503    KMP_FATAL(UserDirectedError, src_loc, message);
4504
4505  __kmp_str_free(&src_loc);
4506}
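
// Presumably this entry point backs the OpenMP 5.1 'error' directive when it
// fires at execution time; a hedged source-level sketch (the exact lowering
// is compiler-specific):
//
//   #pragma omp error at(execution) severity(warning) message("check input")
//
// which would arrive here with severity == severity_warning and the message
// text, taking the KMP_WARNING(UserDirectedWarning, ...) path above.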
4507
4508// Mark begin of scope directive.
4509void __kmpc_scope(ident_t *loc, kmp_int32 gtid, void *reserved) {
4510// reserved is for extension of scope directive and not used.
4511#if OMPT_SUPPORT && OMPT_OPTIONAL
4512  if (ompt_enabled.enabled && ompt_enabled.ompt_callback_work) {
4513    kmp_team_t *team = __kmp_threads[gtid]->th.th_team;
4514    int tid = __kmp_tid_from_gtid(gtid);
4515    ompt_callbacks.ompt_callback(ompt_callback_work)(
4516        ompt_work_scope, ompt_scope_begin,
4517        &(team->t.ompt_team_info.parallel_data),
4518        &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data), 1,
4519        OMPT_GET_RETURN_ADDRESS(0));
4520  }
4521#endif // OMPT_SUPPORT && OMPT_OPTIONAL
4522}
4523
4524// Mark end of scope directive
4525void __kmpc_end_scope(ident_t *loc, kmp_int32 gtid, void *reserved) {
4526// reserved is for extension of scope directive and not used.
4527#if OMPT_SUPPORT && OMPT_OPTIONAL
4528  if (ompt_enabled.enabled && ompt_enabled.ompt_callback_work) {
4529    kmp_team_t *team = __kmp_threads[gtid]->th.th_team;
4530    int tid = __kmp_tid_from_gtid(gtid);
4531    ompt_callbacks.ompt_callback(ompt_callback_work)(
4532        ompt_work_scope, ompt_scope_end,
4533        &(team->t.ompt_team_info.parallel_data),
4534        &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data), 1,
4535        OMPT_GET_RETURN_ADDRESS(0));
4536  }
4537#endif // OMPT_SUPPORT && OMPT_OPTIONAL
4538}
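
// These callbacks presumably bracket the OpenMP 5.1 'scope' construct so that
// OMPT tools observe a work-scope begin/end pair; a hedged source-level
// sketch (tmp and heavy_work are illustrative, exact lowering is
// compiler-specific):
//
//   #pragma omp scope private(tmp)  // entry -> __kmpc_scope()
//   {
//     tmp = heavy_work();
//   }                               // exit  -> __kmpc_end_scope()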
4539
4540#ifdef KMP_USE_VERSION_SYMBOLS
4541// For GOMP compatibility there are two versions of each omp_* API.
4542// One is the plain C symbol and one is the Fortran symbol with an appended
4543// underscore. When we implement a specific ompc_* version of an omp_*
4544// function, we want the plain GOMP versioned symbol to alias the ompc_* version
4545// instead of the Fortran versions in kmp_ftn_entry.h
4546extern "C" {
4547// Have to undef these from omp.h so they aren't translated into
4548// their ompc counterparts in the KMP_VERSION_OMPC_SYMBOL macros below
4549#ifdef omp_set_affinity_format
4550#undef omp_set_affinity_format
4551#endif
4552#ifdef omp_get_affinity_format
4553#undef omp_get_affinity_format
4554#endif
4555#ifdef omp_display_affinity
4556#undef omp_display_affinity
4557#endif
4558#ifdef omp_capture_affinity
4559#undef omp_capture_affinity
4560#endif
4561KMP_VERSION_OMPC_SYMBOL(ompc_set_affinity_format, omp_set_affinity_format, 50,
4562                        "OMP_5.0");
4563KMP_VERSION_OMPC_SYMBOL(ompc_get_affinity_format, omp_get_affinity_format, 50,
4564                        "OMP_5.0");
4565KMP_VERSION_OMPC_SYMBOL(ompc_display_affinity, omp_display_affinity, 50,
4566                        "OMP_5.0");
4567KMP_VERSION_OMPC_SYMBOL(ompc_capture_affinity, omp_capture_affinity, 50,
4568                        "OMP_5.0");
4569} // extern "C"
4570#endif
4571