/* scheduler.c                  -*-C-*-
 *
 *************************************************************************
 *
 *  @copyright
 *  Copyright (C) 2007-2013, Intel Corporation
 *  All rights reserved.
 *
 *  @copyright
 *  Redistribution and use in source and binary forms, with or without
 *  modification, are permitted provided that the following conditions
 *  are met:
 *
 *    * Redistributions of source code must retain the above copyright
 *      notice, this list of conditions and the following disclaimer.
 *    * Redistributions in binary form must reproduce the above copyright
 *      notice, this list of conditions and the following disclaimer in
 *      the documentation and/or other materials provided with the
 *      distribution.
 *    * Neither the name of Intel Corporation nor the names of its
 *      contributors may be used to endorse or promote products derived
 *      from this software without specific prior written permission.
 *
 *  @copyright
 *  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 *  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 *  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 *  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 *  HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 *  INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 *  BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
 *  OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 *  AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 *  LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY
 *  WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 *  POSSIBILITY OF SUCH DAMAGE.
 *
 **************************************************************************/

/*
 * Cilk scheduler
 */

#include "scheduler.h"
#include "bug.h"
#include "os.h"
#include "os_mutex.h"
#include "local_state.h"
#include "signal_node.h"
#include "full_frame.h"
#include "sysdep.h"
#include "except.h"
#include "cilk_malloc.h"
#include "pedigrees.h"
#include "record-replay.h"

#include <limits.h>
#include <string.h> /* memcpy */
#include <stdio.h>  // sprintf
#include <stdlib.h> // malloc, free, abort

#ifdef _WIN32
#   pragma warning(disable:1786)   // disable warning: sprintf is deprecated
#   include "sysdep-win.h"
#   include "except-win32.h"
#endif  // _WIN32

// ICL: Don't complain about conversion from pointer to same-sized integral
// type in __cilkrts_put_stack.  That's why we're using ptrdiff_t
#ifdef _WIN32
#   pragma warning(disable: 1684)
#endif

#include "cilk/cilk_api.h"
#include "frame_malloc.h"
#include "metacall_impl.h"
#include "reducer_impl.h"
#include "cilk-tbb-interop.h"
#include "cilk-ittnotify.h"
#include "stats.h"

// ICL: Don't complain about loss of precision in myrand
// I tried restoring the warning after the function, but it didn't
// suppress it
#ifdef _WIN32
#   pragma warning(disable: 2259)
#endif

#ifndef _WIN32
#   include <unistd.h>
#endif

#ifdef __VXWORKS__
// redeclare longjmp() with noreturn to stop warnings
extern __attribute__((noreturn))
		void longjmp(jmp_buf, int);
#endif

//#define DEBUG_LOCKS 1
#ifdef DEBUG_LOCKS
// The currently executing worker must own this worker's lock
#   define ASSERT_WORKER_LOCK_OWNED(w) \
        { \
            __cilkrts_worker *tls_worker = __cilkrts_get_tls_worker(); \
            CILK_ASSERT((w)->l->lock.owner == tls_worker); \
        }
#else
#   define ASSERT_WORKER_LOCK_OWNED(w)
#endif // DEBUG_LOCKS

// Options for the scheduler.
enum schedule_t { SCHEDULE_RUN,
                  SCHEDULE_WAIT,
                  SCHEDULE_EXIT };

// Return values for provably_good_steal()
enum provably_good_steal_t
{
    ABANDON_EXECUTION,  // Not the last child to the sync - attempt to steal work
    CONTINUE_EXECUTION, // Last child to the sync - continue executing on this worker
    WAIT_FOR_CONTINUE   // The replay log indicates that this was the worker
                        // which continued.  Loop until we are the last worker
                        // to the sync.
};
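
// Illustrative sketch (not the runtime's actual control flow): a caller of
// provably_good_steal() dispatches on these values roughly as follows,
// looping while the replay log tells it to wait:
//
//     enum provably_good_steal_t steal_result;
//     do {
//         steal_result = provably_good_steal(w, ff);
//     } while (WAIT_FOR_CONTINUE == steal_result);
//
//     if (CONTINUE_EXECUTION == steal_result) {
//         /* this worker was the last child; it resumes ff past the sync */
//     } else {
//         /* ABANDON_EXECUTION: leave ff for the last child and go try to
//            steal other work instead */
//     }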


// Verify that "w" is the worker we are currently executing on.
// Because this check is expensive, this method is usually a no-op.
static inline void verify_current_wkr(__cilkrts_worker *w)
{
#if ((REDPAR_DEBUG >= 3) || (FIBER_DEBUG >= 1))
    // Lookup the worker from TLS and compare to w.
    __cilkrts_worker* tmp = __cilkrts_get_tls_worker();
    if (w != tmp) {
        fprintf(stderr, "Error.  W=%d, actual worker =%d...\n",
                w->self,
                tmp->self);
    }
    CILK_ASSERT(w == tmp);
#endif
}

static enum schedule_t worker_runnable(__cilkrts_worker *w);

// Scheduling-fiber functions:
static void do_return_from_spawn (__cilkrts_worker *w,
                                  full_frame *ff,
                                  __cilkrts_stack_frame *sf);
static void do_sync (__cilkrts_worker *w,
                     full_frame *ff,
                     __cilkrts_stack_frame *sf);

// max is defined on Windows and VxWorks
#if (! defined(_WIN32)) && (! defined(__VXWORKS__))
    // TBD: definition of max() for Linux.
#   define max(a, b) ((a) < (b) ? (b) : (a))
#endif

void __cilkrts_dump_stats_to_stderr(global_state_t *g)
{
#ifdef CILK_PROFILE
    int i;
    for (i = 0; i < g->total_workers; ++i) {
        // Print out statistics for each worker.  We collected them,
        // so why not print them out?
        fprintf(stderr, "Stats for worker %d\n", i);
        dump_stats_to_file(stderr, g->workers[i]->l->stats);
        __cilkrts_accum_stats(&g->stats, g->workers[i]->l->stats);
    }

    // Also print out aggregate statistics.
    dump_stats_to_file(stderr, &g->stats);
#endif
    fprintf(stderr,
            "CILK PLUS Thread Info: P=%d, Q=%d\n",
            g->P,
            g->Q);
    fprintf(stderr,
            "CILK PLUS RUNTIME MEMORY USAGE: %lld bytes",
            (long long)g->frame_malloc.allocated_from_os);
#ifdef CILK_PROFILE
    if (g->stats.stack_hwm)
        fprintf(stderr, ", %ld stacks", g->stats.stack_hwm);
#endif
    fputc('\n', stderr);
}

static void validate_worker(__cilkrts_worker *w)
{
    /* check the magic numbers, for debugging purposes */
    if (w->l->worker_magic_0 != WORKER_MAGIC_0 ||
        w->l->worker_magic_1 != WORKER_MAGIC_1)
        abort_because_rts_is_corrupted();
}

static void double_link(full_frame *left_ff, full_frame *right_ff)
{
    if (left_ff)
        left_ff->right_sibling = right_ff;
    if (right_ff)
        right_ff->left_sibling = left_ff;
}

/* add CHILD to the right of all children of PARENT */
static void push_child(full_frame *parent_ff, full_frame *child_ff)
{
    double_link(parent_ff->rightmost_child, child_ff);
    double_link(child_ff, 0);
    parent_ff->rightmost_child = child_ff;
}

/* unlink CHILD from the list of all children of PARENT */
static void unlink_child(full_frame *parent_ff, full_frame *child_ff)
{
    double_link(child_ff->left_sibling, child_ff->right_sibling);

    if (!child_ff->right_sibling) {
        /* this is the rightmost child -- update parent link */
        CILK_ASSERT(parent_ff->rightmost_child == child_ff);
        parent_ff->rightmost_child = child_ff->left_sibling;
    }
    child_ff->left_sibling = child_ff->right_sibling = 0; /* paranoia */
}

static void incjoin(full_frame *ff)
{
    ++ff->join_counter;
}

static int decjoin(full_frame *ff)
{
    CILK_ASSERT(ff->join_counter > 0);
    return (--ff->join_counter);
}

static int simulate_decjoin(full_frame *ff)
{
  CILK_ASSERT(ff->join_counter > 0);
  return (ff->join_counter - 1);
}

/*
 * Pseudo-random generator defined by the congruence S' = 69070 * S
 * mod (2^32 - 5).  Marsaglia (CACM July 1993) says on page 107 that
 * this is a ``good one''.  There you go.
 *
 * The literature makes a big fuss about avoiding the division, but
 * for us it is not worth the hassle.
 */
static const unsigned RNGMOD = ((1ULL << 32) - 5);
static const unsigned RNGMUL = 69070U;

static unsigned myrand(__cilkrts_worker *w)
{
    unsigned state = w->l->rand_seed;
    state = (unsigned)((RNGMUL * (unsigned long long)state) % RNGMOD);
    w->l->rand_seed = state;
    return state;
}

static void mysrand(__cilkrts_worker *w, unsigned seed)
{
    seed %= RNGMOD;
    seed += (seed == 0); /* 0 does not belong to the multiplicative
                            group.  Use 1 instead */
    w->l->rand_seed = seed;
}
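
/* Worked example of the recurrence above (illustration only): starting from
   seed S = 1, successive calls to myrand() yield
       69070 * 1     mod (2^32 - 5) = 69070
       69070 * 69070 mod (2^32 - 5) = 475697609
   and so on.  mysrand() reduces an arbitrary seed into [1, 2^32 - 6] so the
   state always stays inside the multiplicative group. */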

/* W grabs its own lock */
void __cilkrts_worker_lock(__cilkrts_worker *w)
{
    validate_worker(w);
    CILK_ASSERT(w->l->do_not_steal == 0);

    /* tell thieves to stay out of the way */
    w->l->do_not_steal = 1;
    __cilkrts_fence(); /* probably redundant */

    __cilkrts_mutex_lock(w, &w->l->lock);
}

void __cilkrts_worker_unlock(__cilkrts_worker *w)
{
    __cilkrts_mutex_unlock(w, &w->l->lock);
    CILK_ASSERT(w->l->do_not_steal == 1);
    /* The fence is probably redundant.  Use a release
       operation when supported (gcc and compatible);
       that is faster on x86 which serializes normal stores. */
#if defined __GNUC__ && (__GNUC__ * 10 + __GNUC_MINOR__ > 43 || __ICC >= 1110)
    __sync_lock_release(&w->l->do_not_steal);
#else
    w->l->do_not_steal = 0;
    __cilkrts_fence(); /* store-store barrier, redundant on x86 */
#endif
}

/* try to acquire the lock of some *other* worker */
static int worker_trylock_other(__cilkrts_worker *w,
                                __cilkrts_worker *other)
{
    int status = 0;

    validate_worker(other);

    /* This protocol guarantees that, after setting the DO_NOT_STEAL
       flag, worker W can enter its critical section after waiting for
       the thief currently in the critical section (if any) and at
       most one other thief.

       This requirement is overly paranoid, but it should protect us
       against future nonsense from OS implementors.
    */

    /* compete for the right to disturb OTHER */
    if (__cilkrts_mutex_trylock(w, &other->l->steal_lock)) {
        if (other->l->do_not_steal) {
            /* leave it alone */
        } else {
            status = __cilkrts_mutex_trylock(w, &other->l->lock);
        }
        __cilkrts_mutex_unlock(w, &other->l->steal_lock);
    }


    return status;
}

static void worker_unlock_other(__cilkrts_worker *w,
                                __cilkrts_worker *other)
{
    __cilkrts_mutex_unlock(w, &other->l->lock);
}


/* Lock macro Usage:
    BEGIN_WITH_WORKER_LOCK(w) {
        statement;
        statement;
        BEGIN_WITH_FRAME_LOCK(w, ff) {
            statement;
            statement;
        } END_WITH_FRAME_LOCK(w, ff);
    } END_WITH_WORKER_LOCK(w);
 */
#define BEGIN_WITH_WORKER_LOCK(w) __cilkrts_worker_lock(w); do
#define END_WITH_WORKER_LOCK(w)   while (__cilkrts_worker_unlock(w), 0)

// TBD(jsukha): These are worker lock acquisitions on
// a worker whose deque is empty.  My conjecture is that we
// do not need to hold the worker lock at these points.
// I have left them in for now, however.
//
// #define REMOVE_POSSIBLY_OPTIONAL_LOCKS
#ifdef REMOVE_POSSIBLY_OPTIONAL_LOCKS
    #define BEGIN_WITH_WORKER_LOCK_OPTIONAL(w) do
    #define END_WITH_WORKER_LOCK_OPTIONAL(w)   while (0)
#else
    #define BEGIN_WITH_WORKER_LOCK_OPTIONAL(w) __cilkrts_worker_lock(w); do
    #define END_WITH_WORKER_LOCK_OPTIONAL(w)   while (__cilkrts_worker_unlock(w), 0)
#endif


#define BEGIN_WITH_FRAME_LOCK(w, ff)                                     \
    do { full_frame *_locked_ff = ff; __cilkrts_frame_lock(w, _locked_ff); do

#define END_WITH_FRAME_LOCK(w, ff)                       \
    while (__cilkrts_frame_unlock(w, _locked_ff), 0); } while (0)
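
/* For reference, the worker-lock macro pair expands mechanically as follows:

       BEGIN_WITH_WORKER_LOCK(w) {
           statement;
       } END_WITH_WORKER_LOCK(w);

   becomes

       __cilkrts_worker_lock(w); do {
           statement;
       } while (__cilkrts_worker_unlock(w), 0);

   so the lock is released exactly once when control falls out of the
   do/while, and the braces give the critical section its own scope.  The
   frame-lock pair works the same way, with _locked_ff remembering which
   frame to unlock. */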

/* W becomes the owner of F and F can be stolen from W */
static void make_runnable(__cilkrts_worker *w, full_frame *ff)
{
    w->l->frame_ff = ff;

    /* CALL_STACK is invalid (the information is stored implicitly in W) */
    ff->call_stack = 0;
}

/*
 * The worker parameter is unused, except for print-debugging purposes.
 */
static void make_unrunnable(__cilkrts_worker *w,
                            full_frame *ff,
                            __cilkrts_stack_frame *sf,
                            int is_loot,
                            const char *why)
{
    /* CALL_STACK becomes valid again */
    ff->call_stack = sf;

    if (sf) {
#if CILK_LIB_DEBUG
        if (__builtin_expect(sf->flags & CILK_FRAME_EXITING, 0))
            __cilkrts_bug("W%d suspending exiting frame %p/%p\n", w->self, ff, sf);
#endif
        sf->flags |= CILK_FRAME_STOLEN | CILK_FRAME_SUSPENDED;
        sf->worker = 0;

        if (is_loot)
            __cilkrts_put_stack(ff, sf);

        /* perform any system-dependent action, such as saving the
           state of the stack */
        __cilkrts_make_unrunnable_sysdep(w, ff, sf, is_loot, why);
    }
}


/* Push the next full frame to be made active in this worker and increment its
 * join counter.  __cilkrts_push_next_frame and pop_next_frame work on a
 * one-element queue.  This queue is used to communicate across the runtime
 * from the code that wants to activate a frame to the code that can actually
 * begin execution on that frame.  They are asymmetrical in that push
 * increments the join counter but pop does not decrement it.  Rather, a
 * single push/pop combination makes a frame active and increments its join
 * counter once. */
void __cilkrts_push_next_frame(__cilkrts_worker *w, full_frame *ff)
{
    CILK_ASSERT(ff);
    CILK_ASSERT(!w->l->next_frame_ff);
    incjoin(ff);
    w->l->next_frame_ff = ff;
}

/* Get the next full-frame to be made active in this worker.  The join count
 * of the full frame will have been incremented by the corresponding push
 * event.  See __cilkrts_push_next_frame, above.
 */
static full_frame *pop_next_frame(__cilkrts_worker *w)
{
    full_frame *ff;
    ff = w->l->next_frame_ff;
    // Remove the frame from the next_frame field.
    //
    // If this is a user worker, then there is a chance that another worker
    // from our team could push work into our next_frame (if it is the last
    // worker doing work for this team).  The other worker's setting of the
    // next_frame could race with our setting of next_frame to NULL.  This is
    // the only possible race condition on next_frame.  However, if next_frame
    // has a non-NULL value, then it means the team still has work to do, and
    // there is no chance of another team member populating next_frame.  Thus,
    // it is safe to set next_frame to NULL, if it was populated.  There is no
    // need for an atomic op.
    if (NULL != ff) {
        w->l->next_frame_ff = NULL;
    }
    return ff;
}
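
/* Illustrative pairing of the one-element queue (simplified; the real
   scheduling loop appears later in this file):

       // producer side: hand ff to worker w; the join counter is
       // incremented by the push
       __cilkrts_push_next_frame(w, ff);

       // consumer side, in w's scheduling code:
       full_frame *ff = pop_next_frame(w);
       if (ff) {
           // execute ff; the matching decjoin() happens later, when the
           // frame reaches a sync or returns, not in the pop itself.
       }
 */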

/*
 * Identify the single worker that is allowed to cross a sync in this frame.  A
 * thief should call this function when it is the first to steal work from a
 * user worker.  "First to steal work" may mean that there has been parallelism
 * in the user worker before, but the whole team sync'd, and this is the first
 * steal after that.
 *
 * This should happen while holding the worker and frame lock.
 */
static void set_sync_master(__cilkrts_worker *w, full_frame *ff)
{
    w->l->last_full_frame = ff;
    ff->sync_master = w;
}

/*
 * The sync that ends all parallelism for a particular user worker is about to
 * be crossed.  Decouple the worker and frame.
 *
 * No locks need to be held since the user worker isn't doing anything, and none
 * of the system workers can steal from it.  But unset_sync_master() should be
 * called before the user worker knows about this work (i.e., before
 * w->l->next_frame_ff is set).
 */
static void unset_sync_master(__cilkrts_worker *w, full_frame *ff)
{
    CILK_ASSERT(WORKER_USER == w->l->type);
    CILK_ASSERT(ff->sync_master == w);
    ff->sync_master = NULL;
    w->l->last_full_frame = NULL;
}

/********************************************************************
 * THE protocol:
 ********************************************************************/
/*
 * This is a protocol for work stealing that minimizes the overhead on
 * the victim.
 *
 * The protocol uses three shared pointers into the worker's deque:
 * - T - the "tail"
 * - H - the "head"
 * - E - the "exception"  NB: In this case, "exception" has nothing to do
 * with C++ throw-catch exceptions -- it refers only to a non-normal return,
 * i.e., a steal or similar scheduling exception.
 *
 * with H <= E, H <= T.
 *
 * Stack frames SF, where H <= E < T, are available for stealing.
 *
 * The worker operates on the T end of the stack.  The frame being
 * worked on is not on the stack.  To make a continuation available for
 * stealing, the worker pushes a frame onto the stack: stores *T++ = SF.
 * To return, it pops the frame off the stack: obtains SF = *--T.
 *
 * After decrementing T, the condition E > T signals to the victim that
 * it should invoke the runtime system's "THE" exception handler.  The
 * pointer E can become INFINITY, in which case the victim must invoke
 * the THE exception handler as soon as possible.
 *
 * See "The implementation of the Cilk-5 multithreaded language", PLDI 1998,
 * http://portal.acm.org/citation.cfm?doid=277652.277725, for more information
 * on the THE protocol.
 */

/* the infinity value of E */
#define EXC_INFINITY  ((__cilkrts_stack_frame **) (-1))
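
/* Victim-side sketch of the protocol (simplified pseudocode; the real
   sequence lives in the compiler-generated spawn/return code and in the
   runtime's exception handling, not here):

       push:  *T++ = sf;        // expose a continuation for stealing

       pop:   --T;
              if (E > T) {
                  // a thief may have taken (or may be taking) this frame:
                  // restore T, take the worker lock, and let the runtime's
                  // THE handler resolve the race.
              }

   The thief side first increments E (increment_E below) and only then
   checks whether anything remains between H and T; dekker_protocol()
   packages that handshake. */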

static void increment_E(__cilkrts_worker *victim)
{
    __cilkrts_stack_frame *volatile *tmp;

    // The currently executing worker must own the worker lock to touch
    // victim->exc
    ASSERT_WORKER_LOCK_OWNED(victim);

    tmp = victim->exc;
    if (tmp != EXC_INFINITY) {
        /* On most x86 this pair of operations would be slightly faster
           as an atomic exchange due to the implicit memory barrier in
           an atomic instruction. */
        victim->exc = tmp + 1;
        __cilkrts_fence();
    }
}

static void decrement_E(__cilkrts_worker *victim)
{
    __cilkrts_stack_frame *volatile *tmp;

    // The currently executing worker must own the worker lock to touch
    // victim->exc
    ASSERT_WORKER_LOCK_OWNED(victim);

    tmp = victim->exc;
    if (tmp != EXC_INFINITY) {
        /* On most x86 this pair of operations would be slightly faster
           as an atomic exchange due to the implicit memory barrier in
           an atomic instruction. */
        victim->exc = tmp - 1;
        __cilkrts_fence(); /* memory fence not really necessary */
    }
}

#if 0
/* for now unused, will be necessary if we implement abort */
static void signal_THE_exception(__cilkrts_worker *wparent)
{
    wparent->exc = EXC_INFINITY;
    __cilkrts_fence();
}
#endif

static void reset_THE_exception(__cilkrts_worker *w)
{
    // The currently executing worker must own the worker lock to touch
    // w->exc
    ASSERT_WORKER_LOCK_OWNED(w);

    w->exc = w->head;
    __cilkrts_fence();
}

/* conditions under which victim->head can be stolen: */
static int can_steal_from(__cilkrts_worker *victim)
{
    return ((victim->head < victim->tail) &&
            (victim->head < victim->protected_tail));
}

/* Return TRUE if the frame can be stolen, false otherwise */
static int dekker_protocol(__cilkrts_worker *victim)
{
    // increment_E and decrement_E are going to touch victim->exc.  The
    // currently executing worker must own victim's lock before they can
    // modify it
    ASSERT_WORKER_LOCK_OWNED(victim);

    /* ASSERT(E >= H); */

    increment_E(victim);

    /* ASSERT(E >= H + 1); */
    if (can_steal_from(victim)) {
        /* success, we can steal victim->head and set H <- H + 1
           in detach() */
        return 1;
    } else {
        /* failure, restore previous state */
        decrement_E(victim);
        return 0;
    }
}
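
/* Condensed sketch of how a thief drives the protocol (random_steal(),
   further down, is the real caller and adds fiber management, replay
   support, and team checks on top of this; "fiber" here is a stack
   obtained from the thief's fiber pool):

       if (worker_trylock_other(w, victim)) {
           if (victim->l->frame_ff && dekker_protocol(victim)) {
               // E has been incremented and H < T still holds, so the
               // frame at victim->head can be detached.
               detach_for_steal(w, victim, fiber);
           }
           worker_unlock_other(w, victim);
       }
 */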


/* Link PARENT and CHILD in the spawn tree */
static full_frame *make_child(__cilkrts_worker *w,
                              full_frame *parent_ff,
                              __cilkrts_stack_frame *child_sf,
                              cilk_fiber *fiber)
{
    full_frame *child_ff = __cilkrts_make_full_frame(w, child_sf);

    child_ff->parent = parent_ff;
    push_child(parent_ff, child_ff);

    //DBGPRINTF("%d-          make_child - child_frame: %p, parent_frame: %p, child_sf: %p\n"
    //    "            parent - parent: %p, left_sibling: %p, right_sibling: %p, rightmost_child: %p\n"
    //    "            child  - parent: %p, left_sibling: %p, right_sibling: %p, rightmost_child: %p\n",
    //          w->self, child, parent, child_sf,
    //          parent->parent, parent->left_sibling, parent->right_sibling, parent->rightmost_child,
    //          child->parent, child->left_sibling, child->right_sibling, child->rightmost_child);
    CILK_ASSERT(parent_ff->call_stack);
    child_ff->is_call_child = (fiber == NULL);

    /* PLACEHOLDER_FIBER is used as a non-null marker indicating that
       child should be treated as a spawn child even though we have not
       yet assigned a real fiber to its parent. */
    if (fiber == PLACEHOLDER_FIBER)
        fiber = NULL; /* Parent actually gets a null fiber, for now */

    /* perform any system-dependent actions, such as capturing
       parameter passing information */
    /*__cilkrts_make_child_sysdep(child, parent);*/

    /* Child gets reducer map and stack of parent.
       Parent gets a new map and new stack. */
    child_ff->fiber_self = parent_ff->fiber_self;
    child_ff->sync_master = NULL;

    if (child_ff->is_call_child) {
        /* Cause segfault on any attempted access.  The parent gets
           the child map and stack when the child completes. */
        parent_ff->fiber_self = 0;
    } else {
        parent_ff->fiber_self = fiber;
    }

    incjoin(parent_ff);
    return child_ff;
}

static inline __cilkrts_stack_frame *__cilkrts_advance_frame(__cilkrts_stack_frame *sf)
{
    __cilkrts_stack_frame *p = sf->call_parent;
    sf->call_parent = 0;
    return p;
}

/* w should be the currently executing worker.
 * loot_sf is the youngest stack frame in the call stack being
 *   unrolled (i.e., the most deeply nested stack frame.)
 *
 * When this method is called for a steal, loot_sf should be on a
 * victim worker which is different from w.
 * For CILK_FORCE_REDUCE, the victim worker will equal w.
 *
 * Before execution, the __cilkrts_stack_frame's have pointers from
 * older to younger, i.e., a __cilkrts_stack_frame points to parent.
 *
 * This method creates a full frame for each __cilkrts_stack_frame in
 * the call stack, with each full frame also pointing to its parent.
 *
 * The method returns the full frame created for loot_sf, i.e., the
 * youngest full frame.
 */
static full_frame *unroll_call_stack(__cilkrts_worker *w,
                                     full_frame *ff,
                                     __cilkrts_stack_frame *const loot_sf)
{
    __cilkrts_stack_frame *sf = loot_sf;
    __cilkrts_stack_frame *rev_sf = 0;
    __cilkrts_stack_frame *t_sf;

    CILK_ASSERT(sf);
    /*CILK_ASSERT(sf->call_parent != sf);*/

    /* The leafmost frame is unsynched. */
    if (sf->worker != w)
        sf->flags |= CILK_FRAME_UNSYNCHED;

    /* Reverse the call stack to make a linked list ordered from parent
       to child.  sf->call_parent points to the child of SF instead of
       the parent.  */
    do {
        t_sf = (sf->flags & (CILK_FRAME_DETACHED|CILK_FRAME_STOLEN|CILK_FRAME_LAST))? 0 : sf->call_parent;
        sf->call_parent = rev_sf;
        rev_sf = sf;
        sf = t_sf;
    } while (sf);
    sf = rev_sf;

    /* Promote each stack frame to a full frame in order from parent
       to child, following the reversed list we just built. */
    make_unrunnable(w, ff, sf, sf == loot_sf, "steal 1");
    /* T is the *child* of SF, because we have reversed the list */
    for (t_sf = __cilkrts_advance_frame(sf); t_sf;
         sf = t_sf, t_sf = __cilkrts_advance_frame(sf)) {
        ff = make_child(w, ff, t_sf, NULL);
        make_unrunnable(w, ff, t_sf, t_sf == loot_sf, "steal 2");
    }

    /* XXX What if the leafmost frame does not contain a sync
       and this steal is from promote own deque? */
    /*sf->flags |= CILK_FRAME_UNSYNCHED;*/

    CILK_ASSERT(!sf->call_parent);
    return ff;
}
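
/* Worked example (illustration only): suppose the stolen call chain is
   A -> B -> C, where C == loot_sf is the youngest frame and each
   call_parent pointer initially points at the next-older frame.  The
   do/while loop above reverses those links so the list reads A, B, C;
   A is then attached to the existing full frame ff ("steal 1"), and new
   full frames are created for B and C ("steal 2"), each the child of its
   predecessor.  The full frame wrapping C is returned. */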

/* detach the top of the deque frame from the VICTIM and install a new
   CHILD frame in its place */
static void detach_for_steal(__cilkrts_worker *w,
                             __cilkrts_worker *victim,
                             cilk_fiber* fiber)
{
    /* ASSERT: we own victim->lock */

    full_frame *parent_ff, *child_ff, *loot_ff;
    __cilkrts_stack_frame *volatile *h;
    __cilkrts_stack_frame *sf;

    w->l->team = victim->l->team;

    CILK_ASSERT(w->l->frame_ff == 0 || w == victim);

    h = victim->head;

    CILK_ASSERT(*h);

    victim->head = h + 1;

    parent_ff = victim->l->frame_ff;
    BEGIN_WITH_FRAME_LOCK(w, parent_ff) {
        /* parent no longer referenced by victim */
        decjoin(parent_ff);

        /* obtain the victim call stack */
        sf = *h;

        /* perform system-dependent normalizations */
        /*__cilkrts_normalize_call_stack_on_steal(sf);*/

        /* unroll PARENT_FF with call stack SF, adopt the youngest
           frame LOOT.  If loot_ff == parent_ff, then we hold loot_ff->lock,
           otherwise, loot_ff is newly created and we can modify it without
           holding its lock. */
        loot_ff = unroll_call_stack(w, parent_ff, sf);

        #if REDPAR_DEBUG >= 3
        fprintf(stderr, "[W=%d, victim=%d, desc=detach, parent_ff=%p, loot=%p]\n",
                w->self, victim->self,
                parent_ff, loot_ff);
        #endif

        if (WORKER_USER == victim->l->type &&
            NULL == victim->l->last_full_frame) {
            // Mark this looted frame as special: only the original user worker
            // may cross the sync.
            //
            // This call is a shared access to
            // victim->l->last_full_frame.
            set_sync_master(victim, loot_ff);
        }

        /* LOOT is the next frame that the thief W is supposed to
           run, unless the thief is stealing from itself, in which
           case the thief W == VICTIM executes CHILD and nobody
           executes LOOT. */
        if (w == victim) {
            /* Pretend that frame has been stolen */
            loot_ff->call_stack->flags |= CILK_FRAME_UNSYNCHED;
            loot_ff->simulated_stolen = 1;
        }
        else
            __cilkrts_push_next_frame(w, loot_ff);

        // After this "push_next_frame" call, w now owns loot_ff.
        child_ff = make_child(w, loot_ff, 0, fiber);

        BEGIN_WITH_FRAME_LOCK(w, child_ff) {
            /* install child in the victim's work queue, taking
               the parent_ff's place */
            /* child is referenced by victim */
            incjoin(child_ff);

            // With this call, w is bestowing ownership of the newly
            // created frame child_ff to the victim, and victim is
            // giving up ownership of parent_ff.
            //
            // Worker w will either take ownership of parent_ff
            // if parent_ff == loot_ff, or parent_ff will be
            // suspended.
            //
            // Note that this call changes the victim->frame_ff
            // while the victim may be executing.
            make_runnable(victim, child_ff);
        } END_WITH_FRAME_LOCK(w, child_ff);
    } END_WITH_FRAME_LOCK(w, parent_ff);
}

/**
 * @brief cilk_fiber_proc that resumes user code after a successful
 * random steal.
 *
 * This function longjmps back into the user code whose state is
 * stored in cilk_fiber_get_data(fiber)->resume_sf.  The stack pointer
 * is adjusted so that the code resumes on the specified fiber stack
 * instead of its original stack.
 *
 * This method gets executed only on a fiber freshly allocated from a
 * pool.
 *
 * @param fiber   The fiber being used to resume user code.
 * @param arg     Unused.
 */
static
void fiber_proc_to_resume_user_code_for_random_steal(cilk_fiber *fiber)
{
    cilk_fiber_data *data = cilk_fiber_get_data(fiber);
    __cilkrts_stack_frame* sf = data->resume_sf;
    full_frame *ff;

    CILK_ASSERT(sf);

    // When we pull the resume_sf out of the fiber to resume it, clear
    // the old value.
    data->resume_sf = NULL;
    CILK_ASSERT(sf->worker == data->owner);
    ff = sf->worker->l->frame_ff;

    // For Win32, we need to overwrite the default exception handler
    // in this function, so that when the OS exception handling code
    // walks off the top of the current Cilk stack, it reaches our stub
    // handler.

    // Also, this function needs to be wrapped into a try-catch block
    // so the compiler generates the appropriate exception information
    // in this frame.

    // TBD: IS THIS HANDLER IN THE WRONG PLACE?  Can we longjmp out of
    // this function (and does it matter?)
#if defined(_WIN32) && !defined(_WIN64)
    install_exception_stub_handler();
    __try
#endif
    {
        char* new_sp = sysdep_reset_jump_buffers_for_resume(fiber, ff, sf);

        // Notify the Intel tools that we're stealing code
        ITT_SYNC_ACQUIRED(sf->worker);
        NOTIFY_ZC_INTRINSIC("cilk_continue", sf);

        // TBD: We'd like to move TBB-interop methods into the fiber
        // eventually.
        cilk_fiber_invoke_tbb_stack_op(fiber, CILK_TBB_STACK_ADOPT);

        sf->flags &= ~CILK_FRAME_SUSPENDED;

        // longjmp to user code.  Don't process exceptions here,
        // because we are resuming a stolen frame.
        sysdep_longjmp_to_sf(new_sp, sf, NULL);
        /*NOTREACHED*/
        // Intel's C compiler respects the preceding lint pragma
    }
#if defined(_WIN32) && !defined(_WIN64)
    __except (CILK_ASSERT(!"should not execute the stub filter"),
              EXCEPTION_EXECUTE_HANDLER)
    {
        // If we are here, that means something very wrong
        // has happened in our exception processing...
        CILK_ASSERT(! "should not be here!");
    }
#endif
}

static void random_steal(__cilkrts_worker *w)
{
    __cilkrts_worker *victim = NULL;
    cilk_fiber *fiber = NULL;
    int n;
    int success = 0;
    int32_t victim_id;

    // Nothing's been stolen yet. When true, this will flag
    // setup_for_execution_pedigree to increment the pedigree
    w->l->work_stolen = 0;

    /* If the user has disabled stealing (using the debugger) we fail */
    if (__builtin_expect(w->g->stealing_disabled, 0))
        return;

    CILK_ASSERT(w->l->type == WORKER_SYSTEM || w->l->team == w);

    /* Work can still be stolen even if there is only one processor;
       only having a single worker prevents stealing. */
    CILK_ASSERT(w->g->total_workers > 1);

    /* pick random *other* victim */
    n = myrand(w) % (w->g->total_workers - 1);
    if (n >= w->self)
        ++n;

    // If we're replaying a log, override the victim.  -1 indicates that
    // we've exhausted the list of things this worker stole when we recorded
    // the log so just return.  If we're not replaying a log,
    // replay_get_next_recorded_victim() just returns the victim ID passed in.
    n = replay_get_next_recorded_victim(w, n);
    if (-1 == n)
        return;

    victim = w->g->workers[n];

    START_INTERVAL(w, INTERVAL_FIBER_ALLOCATE) {
        /* Verify that we can get a stack.  If not, no need to continue. */
        fiber = cilk_fiber_allocate(&w->l->fiber_pool);
    } STOP_INTERVAL(w, INTERVAL_FIBER_ALLOCATE);


    if (NULL == fiber) {
#if FIBER_DEBUG >= 2
        fprintf(stderr, "w=%d: failed steal because we could not get a fiber\n",
                w->self);
#endif
        return;
    }

    /* do not steal from self */
    CILK_ASSERT (victim != w);

    /* Execute a quick check before engaging in the THE protocol.
       Avoid grabbing locks if there is nothing to steal. */
    if (!can_steal_from(victim)) {
        NOTE_INTERVAL(w, INTERVAL_STEAL_FAIL_EMPTYQ);
        START_INTERVAL(w, INTERVAL_FIBER_DEALLOCATE) {
            int ref_count = cilk_fiber_remove_reference(fiber, &w->l->fiber_pool);
            // Fibers we use when trying to steal should not be active,
            // and thus should not have any other references.
            CILK_ASSERT(0 == ref_count);
        } STOP_INTERVAL(w, INTERVAL_FIBER_DEALLOCATE);
        return;
    }

    /* Attempt to steal work from the victim */
    if (worker_trylock_other(w, victim)) {
        if (w->l->type == WORKER_USER && victim->l->team != w) {

            // Fail to steal if this is a user worker and the victim is not
            // on this team.  If a user worker were allowed to steal work
            // descended from another user worker, the former might not be
            // done with its work by the time it was needed to resume and
            // unbind.  Therefore, user workers are not permitted to change
            // teams.

            // There is no race on the victim's team because the victim cannot
            // change its team until it runs out of work to do, at which point
            // it will try to take out its own lock, and this worker already
            // holds it.
            NOTE_INTERVAL(w, INTERVAL_STEAL_FAIL_USER_WORKER);

        } else if (victim->l->frame_ff) {
            // A successful steal will change victim->frame_ff, even
            // though the victim may be executing.  Thus, the lock on
            // the victim's deque is also protecting victim->frame_ff.
            if (dekker_protocol(victim)) {
                int proceed_with_steal = 1; // optimistic

                // If we're replaying a log, verify that this is the correct frame
                // to steal from the victim
                if (! replay_match_victim_pedigree(w, victim))
                {
                    // Abort the steal attempt. decrement_E(victim) to
                    // counter the increment_E(victim) done by the
                    // dekker protocol
                    decrement_E(victim);
                    proceed_with_steal = 0;
                }

                if (proceed_with_steal)
                {
                    START_INTERVAL(w, INTERVAL_STEAL_SUCCESS) {
                        success = 1;
                        detach_for_steal(w, victim, fiber);
                        victim_id = victim->self;

                        #if REDPAR_DEBUG >= 1
                        fprintf(stderr, "Wkr %d stole from victim %d, fiber = %p\n",
                                w->self, victim->self, fiber);
                        #endif

                        // The use of victim->self contradicts our
                        // classification of the "self" field as
                        // local.  But since this code is only for
                        // debugging, it is ok.
                        DBGPRINTF ("%d-%p: Stealing work from worker %d\n"
                            "            sf: %p, call parent: %p\n",
                            w->self, GetCurrentFiber(), victim->self,
                            w->l->next_frame_ff->call_stack,
                            w->l->next_frame_ff->call_stack->call_parent);
                    } STOP_INTERVAL(w, INTERVAL_STEAL_SUCCESS);
                }  // end if(proceed_with_steal)
            } else {
                NOTE_INTERVAL(w, INTERVAL_STEAL_FAIL_DEKKER);
            }
        } else {
            NOTE_INTERVAL(w, INTERVAL_STEAL_FAIL_EMPTYQ);
        }
        worker_unlock_other(w, victim);
    } else {
        NOTE_INTERVAL(w, INTERVAL_STEAL_FAIL_LOCK);
    }

    // Record whether work was stolen.  When true, this will flag
    // setup_for_execution_pedigree to increment the pedigree
    w->l->work_stolen = success;

    if (0 == success) {
        // failed to steal work.  Return the fiber to the pool.
        START_INTERVAL(w, INTERVAL_FIBER_DEALLOCATE) {
            int ref_count = cilk_fiber_remove_reference(fiber, &w->l->fiber_pool);
            // Fibers we use when trying to steal should not be active,
            // and thus should not have any other references.
            CILK_ASSERT(0 == ref_count);
        } STOP_INTERVAL(w, INTERVAL_FIBER_DEALLOCATE);
    }
    else
    {
        // Since our steal was successful, finish initialization of
        // the fiber.
        cilk_fiber_reset_state(fiber,
                               fiber_proc_to_resume_user_code_for_random_steal);
        // Record the pedigree of the frame that w has stolen.
        // record only if CILK_RECORD_LOG is set
        replay_record_steal(w, victim_id);
    }
}



/**
 * At a provably good steal, we need to transfer the child reducer map
 * from ff->children_reducer_map into v->reducer_map, where v is the
 * worker that resumes execution of ff.
 *
 * Normally, we have v == w, where w is the currently executing
 * worker.  In the case where we are resuming a team leader on a user
 * worker, however, v might differ from w.
 *
 * Thus, this operation is a no-op, since we can't really move
1058111028Sjeff * ff->children_reducer_map into w here.
1059111028Sjeff *
1060104126Sjulian * Instead, this work is done in setup_for_execution_reducers().
1061104031Sjulian */
1062104031Sjulianstatic inline void provably_good_steal_reducers(__cilkrts_worker *w,
1063104126Sjulian                                                full_frame       *ff)
1064104031Sjulian{
1065111028Sjeff    // No-op.
1066104031Sjulian}
1067107034Sdavidxu
1068107034Sdavidxu/* at a provably good steal, incorporate the accumulated exceptions of
1069107034Sdavidxu   children into the parent's exception */
1070107034Sdavidxustatic void provably_good_steal_exceptions(__cilkrts_worker *w,
1071107034Sdavidxu                                           full_frame       *ff)
1072111028Sjeff{
1073111028Sjeff    // ASSERT: we own ff->lock
1074111028Sjeff    ff->pending_exception =
1075108338Sjulian        __cilkrts_merge_pending_exceptions(w,
1076104031Sjulian                                           ff->child_pending_exception,
107799026Sjulian                                           ff->pending_exception);
1078104031Sjulian    ff->child_pending_exception = NULL;
1079104031Sjulian}
1080111028Sjeff
1081104031Sjulian/* At sync discard the frame's old stack and take the leftmost child's. */
1082104031Sjulianstatic void provably_good_steal_stacks(__cilkrts_worker *w, full_frame *ff)
1083111028Sjeff{
1084104031Sjulian    CILK_ASSERT(NULL == ff->fiber_self);
1085104126Sjulian    ff->fiber_self = ff->fiber_child;
1086104031Sjulian    ff->fiber_child = NULL;
1087104031Sjulian}
1088104031Sjulian
static void __cilkrts_mark_synched(full_frame *ff)
{
    ff->call_stack->flags &= ~CILK_FRAME_UNSYNCHED;
    ff->simulated_stolen = 0;
}

static
enum provably_good_steal_t provably_good_steal(__cilkrts_worker *w,
                                               full_frame       *ff)
{
    // ASSERT: we hold w->lock and ff->lock

    enum provably_good_steal_t result = ABANDON_EXECUTION;

    // If the current replay entry is a sync record matching the worker's
    // pedigree, AND this isn't the last child to the sync, return
    // WAIT_FOR_CONTINUE to indicate that the caller should loop until
    // we find the right frame to steal and CONTINUE_EXECUTION is returned.
    int match_found = replay_match_sync_pedigree(w);
    if (match_found && (0 != simulate_decjoin(ff)))
        return WAIT_FOR_CONTINUE;

    START_INTERVAL(w, INTERVAL_PROVABLY_GOOD_STEAL) {
        if (decjoin(ff) == 0) {
            provably_good_steal_reducers(w, ff);
            provably_good_steal_exceptions(w, ff);
            provably_good_steal_stacks(w, ff);
            __cilkrts_mark_synched(ff);

            // If the original owner wants this frame back (to resume
            // it on its original thread) pass it back now.
            if (NULL != ff->sync_master) {
                // The frame wants to go back and be executed by the original
                // user thread.  We can throw caution to the wind and push the
                // frame straight onto its queue because the only way we have
                // gotten to this point of being able to continue execution of
                // the frame is if the original user worker is spinning without
                // work.

                unset_sync_master(w->l->team, ff);
                __cilkrts_push_next_frame(w->l->team, ff);

                // If this is the team leader we're not abandoning the work
                if (w == w->l->team)
                    result = CONTINUE_EXECUTION;
            } else {
                __cilkrts_push_next_frame(w, ff);
                result = CONTINUE_EXECUTION;  // Continue working on this thread
            }

            // The __cilkrts_push_next_frame() call changes ownership
            // of ff to the specified worker.
        }
    } STOP_INTERVAL(w, INTERVAL_PROVABLY_GOOD_STEAL);

    // Only write a SYNC record if:
    // - We're recording a log *AND*
    // - We're the worker continuing from this sync
    replay_record_sync(w, result == CONTINUE_EXECUTION);

    // If we're replaying a log, and matched a sync from the log, mark the
    // sync record seen if the sync isn't going to be abandoned.
    replay_advance_from_sync (w, match_found, result == CONTINUE_EXECUTION);

    return result;
}

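/* Unconditionally pass frame ff to worker w: decrement its join counter
   and push it as w's next frame.  Used below when a called child
   completes and the parent should resume immediately. */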
static void unconditional_steal(__cilkrts_worker *w,
                                full_frame *ff)
{
    // ASSERT: we hold ff->lock

    START_INTERVAL(w, INTERVAL_UNCONDITIONAL_STEAL) {
        decjoin(ff);
        __cilkrts_push_next_frame(w, ff);
    } STOP_INTERVAL(w, INTERVAL_UNCONDITIONAL_STEAL);
}


/* CHILD is about to die.  Give its exceptions to a sibling or to the
   parent.  */
static inline void splice_exceptions_for_call(__cilkrts_worker *w,
                                              full_frame *parent_ff,
                                              full_frame *child_ff)
{
    // ASSERT: We own parent_ff->lock
    CILK_ASSERT(child_ff->is_call_child);
    CILK_ASSERT(NULL == child_ff->right_pending_exception);
    CILK_ASSERT(NULL == parent_ff->pending_exception);

    parent_ff->pending_exception = child_ff->pending_exception;
    child_ff->pending_exception = NULL;
}

/**
 * Merge exceptions for a dying child.
 *
 * @param w                   The currently executing worker.
 * @param ff                  The child frame that is dying.
 * @param left_exception_ptr  Pointer to the exception that is to our left.
 */
static inline
void splice_exceptions_for_spawn(__cilkrts_worker *w,
                                 full_frame *ff,
                                 struct pending_exception_info **left_exception_ptr)
{
    // ASSERT: parent_ff == child_ff->parent.
    // ASSERT: We own parent_ff->lock

    // Merge current exception into the slot where the left
    // exception should go.
    *left_exception_ptr =
        __cilkrts_merge_pending_exceptions(w,
                                           *left_exception_ptr,
                                           ff->pending_exception);
    ff->pending_exception = NULL;


    // Merge right exception into the slot where the left exception
    // should go.
    *left_exception_ptr =
        __cilkrts_merge_pending_exceptions(w,
                                           *left_exception_ptr,
                                           ff->right_pending_exception);
    ff->right_pending_exception = NULL;
}


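/* CHILD is about to die.  Move the child's self fiber up to the parent;
   an attached parent has no self fiber of its own (asserted below). */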
static inline void splice_stacks_for_call(__cilkrts_worker *w,
                                          full_frame *parent_ff,
                                          full_frame *child_ff)
{
#if CILK_LIB_DEBUG
    if (parent_ff->call_stack)
        CILK_ASSERT(!(parent_ff->call_stack->flags & CILK_FRAME_MBZ));
#endif

    /* A synched child frame has no accumulated child fiber. */
    CILK_ASSERT(!child_ff->fiber_child);
    CILK_ASSERT(child_ff->is_call_child);

    /* An attached parent has no self fiber.  It may have
       accumulated child fibers or child owners, which should be
       ignored until sync. */
    CILK_ASSERT(!parent_ff->fiber_self);
    parent_ff->fiber_self = child_ff->fiber_self;
    child_ff->fiber_self = NULL;
}

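/* A called child (child_ff) has completed.  Splice its exception and
   fiber into the parent, unlink it from the parent's child list, make
   the parent runnable again, and destroy the child frame. */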
static void finalize_child_for_call(__cilkrts_worker *w,
                                    full_frame *parent_ff,
                                    full_frame *child_ff)
{
    // ASSERT: we hold w->lock and parent_ff->lock

    START_INTERVAL(w, INTERVAL_FINALIZE_CHILD) {
        CILK_ASSERT(child_ff->is_call_child);
        CILK_ASSERT(child_ff->join_counter == 0);
        CILK_ASSERT(!child_ff->rightmost_child);
        CILK_ASSERT(child_ff == parent_ff->rightmost_child);

        // CHILD is about to die.
        // Splicing out reducers is a no-op for a call since
        // w->reducer_map should already store the correct
        // reducer map.

        // ASSERT there are no maps left to reduce.
        CILK_ASSERT(NULL == child_ff->children_reducer_map);
        CILK_ASSERT(NULL == child_ff->right_reducer_map);

        splice_exceptions_for_call(w, parent_ff, child_ff);

        splice_stacks_for_call(w, parent_ff, child_ff);

        /* remove CHILD from list of children of PARENT */
        unlink_child(parent_ff, child_ff);

        /* continue with the parent. */
        unconditional_steal(w, parent_ff);
        __cilkrts_destroy_full_frame(w, child_ff);
    } STOP_INTERVAL(w, INTERVAL_FINALIZE_CHILD);
}


/**
 * The invariant on ff->children_reducer_map is that when ff is
 * synched and when we are about to resume execution of ff, at least
 * one of ff->children_reducer_map and w->reducer_map must be NULL.
 *
 * Consider the two possibilities before resuming execution of ff:
 *
 * 1.  Suppose ff is synched and suspended.  Then either
 *
 *     (a) ff->children_reducer_map stores the reducer map that w
 *         should use, where w is the worker resuming execution of ff,
 *         OR
 *     (b) w already has a user map, and ff->children_reducer_map is NULL.
 *
 *     Case (a) happens when we are resuming execution of ff as a
 *     provably good steal.  In this case, w->reducer_map should be
 *     NULL and ff->children_reducer_map is valid.  To resume
 *     execution of ff on w, set w->reducer_map to
 *     ff->children_reducer_map.
 *
 *     Case (b) occurs when we resume execution of ff because ff is a
 *     called child.  Then, ff->children_reducer_map should be NULL,
 *     and w should already have a valid reducer map when resuming
 *     execution of ff.  We resume execution of ff without changing
 *     w->reducer_map.
 *
 * 2. Suppose frame ff is not synched (i.e., it is active and might have
 *    active children).   Then ff->children_reducer_map is the slot for
 *    storing the reducer map from ff's leftmost child, as in the reducer
 *    protocol.   The runtime may resume execution of ff while it is not
 *    synched only because of a steal.
 *    In this case, while we are resuming ff, ff->children_reducer_map
 *    may be non-NULL (because one of ff's children has completed).
 *    We resume execution of ff without changing w->reducer_map.
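 *
 * In short, the only transfer performed below is for case 1(a):
 *     w->reducer_map = ff->children_reducer_map;
 *     ff->children_reducer_map = NULL;
 * In every other case both maps are left untouched.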
 */
static void setup_for_execution_reducers(__cilkrts_worker *w,
                                         full_frame *ff)
{
    // We only need to move ff->children_reducer_map into
    // w->reducer_map in case 1(a).
    //
    // First check whether ff is synched.
    __cilkrts_stack_frame *sf = ff->call_stack;
    if (!(sf->flags & CILK_FRAME_UNSYNCHED)) {
        // In this case, ff is synched. (Case 1).
        CILK_ASSERT(!ff->rightmost_child);

        // Test whether we are in case 1(a) and have
        // something to do.  Note that if both
        // ff->children_reducer_map and w->reducer_map are NULL, we
        // can't distinguish between cases 1(a) and 1(b) here.
        if (ff->children_reducer_map) {
            // We are in Case 1(a).
            CILK_ASSERT(!w->reducer_map);
            w->reducer_map = ff->children_reducer_map;
            ff->children_reducer_map = NULL;
        }
    }
}

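/* Move any exception pending on the full frame into the worker's local
   pending-exception slot before the frame's user code is resumed. */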
static void setup_for_execution_exceptions(__cilkrts_worker *w,
                                           full_frame *ff)
{
    CILK_ASSERT(NULL == w->l->pending_exception);
    w->l->pending_exception = ff->pending_exception;
    ff->pending_exception = NULL;
}

#if 0 /* unused */
static void setup_for_execution_stack(__cilkrts_worker *w,
                                      full_frame *ff)
{
}
#endif

/*
 * setup_for_execution_pedigree
 *
 * Copies the pedigree information from the frame we're resuming to the
 * worker.  Increments the pedigree if this is work that has been stolen
 * to match the increment on a return from a spawn helper.
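 *
 * For example, if the resumed frame's parent pedigree has rank r and the
 * work was stolen, the worker continues with rank r + 1; if the frame was
 * only marked pedigree-unsynched, the rank stays at r.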
 */
static void setup_for_execution_pedigree(__cilkrts_worker *w)
{
    int pedigree_unsynched;
    __cilkrts_stack_frame *sf = w->current_stack_frame;

    CILK_ASSERT(NULL != sf);

    // If this isn't an ABI 1 or later frame, there's no pedigree information
    if (0 == CILK_FRAME_VERSION_VALUE(sf->flags))
        return;

    // Note whether the pedigree is unsynched and clear the flag before
    // we forget
    pedigree_unsynched = sf->flags & CILK_FRAME_SF_PEDIGREE_UNSYNCHED;
    sf->flags &= ~CILK_FRAME_SF_PEDIGREE_UNSYNCHED;

    // If we're just marshalling onto this worker, do not increment
    // the rank since that wouldn't happen in a sequential execution
    if (w->l->work_stolen || pedigree_unsynched)
    {
        if (w->l->work_stolen)
            w->pedigree.rank = sf->parent_pedigree.rank + 1;
        else
            w->pedigree.rank = sf->parent_pedigree.rank;
    }

    w->pedigree.parent = sf->parent_pedigree.parent;
    w->l->work_stolen = 0;
}

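/* Install full frame ff on worker w before resuming it: splice in any
   child reducer map and pending exception, point w at ff's call stack,
   reset the deque, update the pedigree (unless this is a return from a
   call), and mark ff runnable. */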
static void setup_for_execution(__cilkrts_worker *w,
                                full_frame *ff,
                                int is_return_from_call)
{
    // ASSERT: We own w->lock and ff->lock || P == 1

    setup_for_execution_reducers(w, ff);
    setup_for_execution_exceptions(w, ff);
    /*setup_for_execution_stack(w, ff);*/

    ff->call_stack->worker = w;
    w->current_stack_frame = ff->call_stack;

    // If this is a return from a call, leave the pedigree alone
    if (! is_return_from_call)
        setup_for_execution_pedigree(w);

    __cilkrts_setup_for_execution_sysdep(w, ff);

    w->head = w->tail = w->l->ltq;
    reset_THE_exception(w);

    make_runnable(w, ff);
}


/*
 * Called by the scheduling fiber, right before
 * resuming a sf/ff for user code.
 *
 * This method associates the specified sf with the worker.
 *
 * It also asserts that w, ff, and sf all have the expected properties
 * for resuming user code.
 */
void scheduling_fiber_prepare_to_resume_user_code(__cilkrts_worker *w,
                                                  full_frame *ff,
                                                  __cilkrts_stack_frame *sf)
{
    w->current_stack_frame = sf;
    sf->worker = w;

    // Lots of debugging checks on the state of the fiber we might be
    // resuming.
#if FIBER_DEBUG >= 1
#   if FIBER_DEBUG >= 3
    {
        fprintf(stderr, "w=%d: ff=%p, sf=%p. about to resume user code\n",
                w->self, ff, sf);
    }
#   endif

    const int flags = sf->flags;
    CILK_ASSERT(flags & CILK_FRAME_SUSPENDED);
    CILK_ASSERT(!sf->call_parent);
    CILK_ASSERT(w->head == w->tail);

    /* A frame cannot be resumed unless it was suspended. */
    CILK_ASSERT(ff->sync_sp != NULL);

    /* The leftmost frame has no allocated stack */
    if (ff->simulated_stolen)
        CILK_ASSERT(flags & CILK_FRAME_UNSYNCHED);
    else if (flags & CILK_FRAME_UNSYNCHED)
        /* XXX By coincidence sync_sp could be null. */
        CILK_ASSERT(ff->fiber_self != NULL);
    else
        /* XXX This frame could be resumed unsynched on the leftmost stack */
        CILK_ASSERT((ff->sync_master == 0 || ff->sync_master == w));
    CILK_ASSERT(w->l->frame_ff == ff);
#endif
}


/**
 * This method is the first method that should execute after we've
 * switched to a scheduling fiber from user code.
 *
 * @param fiber The scheduling fiber for the current worker.
 */
static void enter_runtime_transition_proc(cilk_fiber *fiber)
{
    // We can execute this method for one of three reasons:
    // 1. Undo-detach finds parent stolen.
    // 2. Sync suspends frame.
    // 3. Return from Cilk entry point.
    //
    // In cases 1 and 2, the frame may be truly suspended or
    // may be immediately executed by this worker after provably_good_steal.
    //
    // There is a fourth case, which can, but does not need to execute
    // this function:
    //   4. Starting up the scheduling loop on a user or
    //      system worker.  In this case, we won't have
    //      a scheduling stack function to run.
    __cilkrts_worker* w = cilk_fiber_get_owner(fiber);
    if (w->l->post_suspend) {
        // Run the continuation function passed to longjmp_into_runtime
        run_scheduling_stack_fcn(w);

        // After we have jumped into the runtime and run the
        // scheduling function, any reducer map the worker had before
        // entering the runtime should have already been saved into
        // the appropriate full frame.
        CILK_ASSERT(NULL == w->reducer_map);

        // There shouldn't be any uncaught exceptions.
        //
        // In Windows, the OS catches any exceptions not caught by the
        // user code.  Thus, we are omitting the check on Windows.
        //
        // On Android, calling std::uncaught_exception with the stlport
        // library causes a seg fault.  Since we're not supporting
        // exceptions there at this point, just don't do the check
        //
        // TBD: Is this check also safe to do on Windows?
        CILKBUG_ASSERT_NO_UNCAUGHT_EXCEPTION();
    }
}


/**
 * Method called to jump back to executing user code.
 *
 * A normal return from the runtime back to resuming user code calls
 * this method.  A computation executed using force_reduce also calls
 * this method to return to user code.
 *
 * This function should not contain any code that depends on a fiber.
 * In a force-reduce case, the user worker may not have a fiber.  In
 * the force-reduce case, we call this method directly instead of
 * calling @c user_code_resume_after_switch_into_runtime.
 */
static inline NORETURN
cilkrts_resume(__cilkrts_stack_frame *sf, full_frame *ff)
{
    // Save the sync stack pointer, and do the bookkeeping
    char* sync_sp = ff->sync_sp;
    __cilkrts_take_stack(ff, sync_sp);  // leaves ff->sync_sp null

    sf->flags &= ~CILK_FRAME_SUSPENDED;
    // Actually longjmp to the user code.
    // We may have exceptions to deal with, since we are resuming
    // a previously suspended frame.
    sysdep_longjmp_to_sf(sync_sp, sf, ff);
}


/**
 * Called by the user-code fiber right before resuming a full frame
 * (sf/ff).
 *
 * This method pulls sf/ff out of the worker, and then calls
 * cilkrts_resume to jump to user code.
 */
static NORETURN
user_code_resume_after_switch_into_runtime(cilk_fiber *fiber)
{
    __cilkrts_worker *w = cilk_fiber_get_owner(fiber);
    __cilkrts_stack_frame *sf;
    full_frame *ff;
    sf = w->current_stack_frame;
    ff = sf->worker->l->frame_ff;

#if FIBER_DEBUG >= 1
    CILK_ASSERT(ff->fiber_self == fiber);
    cilk_fiber_data *fdata = cilk_fiber_get_data(fiber);
    DBGPRINTF ("%d-%p: resume_after_switch_into_runtime, fiber=%p\n",
               w->self, w, fiber);
    CILK_ASSERT(sf == fdata->resume_sf);
#endif

    // Notify the Intel tools that we're stealing code
    ITT_SYNC_ACQUIRED(sf->worker);
    NOTIFY_ZC_INTRINSIC("cilk_continue", sf);
    cilk_fiber_invoke_tbb_stack_op(fiber, CILK_TBB_STACK_ADOPT);

    // Actually jump to user code.
    cilkrts_resume(sf, ff);
}


/* The current stack is about to either be suspended or destroyed.  This
 * function will switch to the stack on which the scheduler is suspended and
 * resume running the scheduler within its scheduling loop.  Upon waking up,
 * the scheduler will run the 'fcn' function, using the supplied worker and
 * frame.
 */
static NORETURN
longjmp_into_runtime(__cilkrts_worker *w,
                     scheduling_stack_fcn_t fcn,
                     __cilkrts_stack_frame *sf)
{
    full_frame *ff, *ff2;

    CILK_ASSERT(!w->l->post_suspend);
    ff = w->l->frame_ff;

    // If we've got only one worker, stealing shouldn't be possible.
    // Assume that this is a steal or return from spawn in a force-reduce case.
    // We don't have a scheduling stack to switch to, so call the continuation
    // function directly.
    if (1 == w->g->P) {
        fcn(w, ff, sf);

        /* The call to fcn() will have pushed ff as the next frame.  If
         * this were a normal (non-forced-reduce) execution, there would have
         * been a pop_next_frame call in a separate part of the runtime.  We
         * must call pop_next_frame here to complete the push/pop cycle. */
        ff2 = pop_next_frame(w);

        setup_for_execution(w, ff2, 0);
        scheduling_fiber_prepare_to_resume_user_code(w, ff2, w->current_stack_frame);
        cilkrts_resume(w->current_stack_frame, ff2);

// Suppress clang warning that the expression result is unused
#if defined(__clang__) && (! defined(__INTEL_COMPILER))
#   pragma clang diagnostic push
#   pragma clang diagnostic ignored "-Wunused-value"
#endif // __clang__
        /* no return */
        CILK_ASSERT(((void)"returned from __cilkrts_resume", 0));
#if defined(__clang__) && (! defined(__INTEL_COMPILER))
#   pragma clang diagnostic pop
#endif // __clang__
    }

    w->l->post_suspend = fcn;
    w->l->suspended_stack = sf;

    ITT_SYNC_RELEASING(w);
    ITT_SYNC_PREPARE(w);

#if FIBER_DEBUG >= 2
    fprintf(stderr, "ThreadId=%p, W=%d: about to switch into runtime... w->l->frame_ff = %p, sf=%p\n",
            cilkos_get_current_thread_id(),
            w->self, w->l->frame_ff,
            sf);
#endif

    // Current fiber is either the (1) one we are about to free,
    // or (2) it has been passed up to the parent.
    cilk_fiber *current_fiber = ( w->l->fiber_to_free ?
                                  w->l->fiber_to_free :
                                  w->l->frame_ff->parent->fiber_child );
    cilk_fiber_data* fdata = cilk_fiber_get_data(current_fiber);
    CILK_ASSERT(NULL == w->l->frame_ff->fiber_self);

    // Clear the sf in the current fiber for cleanliness, to prevent
    // us from accidentally resuming a bad sf.
    // Technically, resume_sf gets overwritten for a fiber when
    // we are about to resume it anyway.
    fdata->resume_sf = NULL;
    CILK_ASSERT(fdata->owner == w);

    // Set the function to execute immediately after switching to the
    // scheduling fiber, but before freeing any fibers.
    cilk_fiber_set_post_switch_proc(w->l->scheduling_fiber,
                                    enter_runtime_transition_proc);
    cilk_fiber_invoke_tbb_stack_op(current_fiber, CILK_TBB_STACK_ORPHAN);

    if (w->l->fiber_to_free) {
        // Case 1: we are freeing this fiber.  We never
        // resume this fiber again after jumping into the runtime.
        w->l->fiber_to_free = NULL;

        // Extra check. Normally, the fiber we are about to switch to
        // should have a NULL owner.
        CILK_ASSERT(NULL == cilk_fiber_get_data(w->l->scheduling_fiber)->owner);
#if FIBER_DEBUG >= 4
        fprintf(stderr, "ThreadId=%p, W=%d: about to switch into runtime.. current_fiber = %p, deallocate, switch to fiber %p\n",
                cilkos_get_current_thread_id(),
                w->self,
                current_fiber, w->l->scheduling_fiber);
#endif
        cilk_fiber_invoke_tbb_stack_op(current_fiber, CILK_TBB_STACK_RELEASE);
        NOTE_INTERVAL(w, INTERVAL_DEALLOCATE_RESUME_OTHER);
        cilk_fiber_remove_reference_from_self_and_resume_other(current_fiber,
                                                               &w->l->fiber_pool,
                                                               w->l->scheduling_fiber);
        // We should never come back here!
        CILK_ASSERT(0);
    }
    else {
        // Case 2: We are passing the fiber to our parent because we
        // are leftmost.  We should come back later to
        // resume execution of user code.
        //
        // If we are not freeing a fiber, then we must be
        // returning from a spawn or processing an exception.  The
        // "sync" path always frees a fiber.
        //
        // We must be the leftmost child, and by left holder logic, we
        // have already moved the current fiber into our parent full
        // frame.
#if FIBER_DEBUG >= 2
        fprintf(stderr, "ThreadId=%p, W=%d: about to suspend self into runtime.. current_fiber = %p, deallocate, switch to fiber %p\n",
                cilkos_get_current_thread_id(),
                w->self,
                current_fiber, w->l->scheduling_fiber);
#endif

        NOTE_INTERVAL(w, INTERVAL_SUSPEND_RESUME_OTHER);

        cilk_fiber_suspend_self_and_resume_other(current_fiber,
                                                 w->l->scheduling_fiber);
        // Resuming this fiber returns control back to
        // this function because our implementation uses OS fibers.
        //
        // On Unix, we could instead pass
        // user_code_resume_after_switch_into_runtime as an extra "resume_proc"
        // that resumes execution of user code directly, instead of
        // jumping back here and then jumping back to user code.
#if FIBER_DEBUG >= 2
        CILK_ASSERT(fdata->owner == __cilkrts_get_tls_worker());
#endif
        user_code_resume_after_switch_into_runtime(current_fiber);
    }
}

/*
 * Send a message to the children of the specified worker: run or wait.
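 *
 * System workers are arranged in an implicit binary tree: worker n
 * signals workers 2n + 1 and 2n + 2 (when they exist), so a run/wait
 * message reaches all P - 1 system workers in O(log P) rounds.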
 */
static void notify_children(__cilkrts_worker *w, unsigned int msg)
{
    int child_num;
    __cilkrts_worker *child;
    int num_sys_workers = w->g->P - 1;

    // If worker is "n", then its children are 2n + 1, and 2n + 2.
    child_num = (w->self << 1) + 1;
    if (child_num < num_sys_workers) {
        child = w->g->workers[child_num];
        CILK_ASSERT(child->l->signal_node);
        signal_node_msg(child->l->signal_node, msg);
        child_num++;
        if (child_num < num_sys_workers) {
            child = w->g->workers[child_num];
            CILK_ASSERT(child->l->signal_node);
            signal_node_msg(child->l->signal_node, msg);
        }
    }
}

/*
 * Notify this worker's children that they need to wait.
 */
static void notify_children_wait(__cilkrts_worker *w)
{
    notify_children(w, 0);
}

/*
 * Notify this worker's children to run and start trying to steal.
 */
static void notify_children_run(__cilkrts_worker *w)
{
    notify_children(w, 1);
}

/**
 * A single "check" to find work, either on our queue or through a
 * steal attempt.  This method checks our local queue once, and
 * performs one steal attempt.
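 *
 * A failed steal yields the processor and increments
 * w->l->steal_failure_count; a successful pop or steal resets the
 * count to zero.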
 */
static full_frame* check_for_work(__cilkrts_worker *w)
{
    full_frame *ff = NULL;
    ff = pop_next_frame(w);
    // If there is no work on the queue, try to steal some.
    if (NULL == ff) {
        START_INTERVAL(w, INTERVAL_STEALING) {
            if (w->l->type != WORKER_USER && w->l->team != NULL) {
                // At this point, the worker knows for certain that it has run
                // out of work.  Therefore, it loses its team affiliation.  User
                // workers never change teams, of course.
                __cilkrts_worker_lock(w);
                w->l->team = NULL;
                __cilkrts_worker_unlock(w);
            }

            // If we are about to do a random steal, we should have no
            // full frame...
            CILK_ASSERT(NULL == w->l->frame_ff);
            random_steal(w);
        } STOP_INTERVAL(w, INTERVAL_STEALING);

        // If the steal was successful, then the worker has populated its next
        // frame with the work to resume.
        ff = pop_next_frame(w);
        if (NULL == ff) {
            // Punish the worker for failing to steal.
            // No quantum for you!
            __cilkrts_yield();
            w->l->steal_failure_count++;
        } else {
            // Reset steal_failure_count since there is obviously still work to
            // be done.
            w->l->steal_failure_count = 0;
        }
    }
    return ff;
}

/**
 * Keep stealing or looking on our queue.
 *
 * Returns either when a full frame is found, or NULL if the
 * computation is done.
 */
static full_frame* search_until_work_found_or_done(__cilkrts_worker *w)
{
    full_frame *ff = NULL;
    // Find a full frame to execute (either through random stealing,
    // or because we pull it off w's 1-element queue).
    while (!ff) {
        // Check worker state to figure out our next action.
        switch (worker_runnable(w))
        {
        case SCHEDULE_RUN:             // One attempt at checking for work.
            ff = check_for_work(w);
            break;
        case SCHEDULE_WAIT:            // go into wait-mode.
            CILK_ASSERT(WORKER_SYSTEM == w->l->type);
            // If we are about to wait, then we better not have
            // a frame that we should execute...
            CILK_ASSERT(NULL == w->l->next_frame_ff);
            notify_children_wait(w);
            signal_node_wait(w->l->signal_node);
            // ...
            // Runtime is waking up.
            notify_children_run(w);
            w->l->steal_failure_count = 0;
            break;
        case SCHEDULE_EXIT:            // exit the scheduler.
            CILK_ASSERT(WORKER_USER != w->l->type);
            return NULL;
        default:
            CILK_ASSERT(0);
            abort();
        }
    }
    return ff;
}

/**
 * The proc method for a scheduling fiber on a user worker.
 *
 * When a user worker jumps into the runtime, it jumps into this
 * method by either starting it if the scheduling fiber has never run
 * before, or resuming the fiber if it was previously suspended.
 */
COMMON_PORTABLE
void scheduler_fiber_proc_for_user_worker(cilk_fiber *fiber)
{
    __cilkrts_worker* w = cilk_fiber_get_owner(fiber);
    CILK_ASSERT(w);

    // This must be a user worker
    CILK_ASSERT(WORKER_USER == w->l->type);

    // If we aren't the current worker, then something is very wrong
    // here..
    verify_current_wkr(w);

    __cilkrts_run_scheduler_with_exceptions(w);
}


/**
 * The body of the runtime scheduling loop.  This function executes in
 * 4 stages:
 *
 * 1. Transitions from the user code into the runtime by
 *    executing any scheduling-stack functions.
 * 2. Looks for a full frame enqueued from a successful provably
 *    good steal.
 * 3. If no full frame is found in step 2, steal until
 *    a frame is found or we are done.  If we are done, finish
 *    the scheduling loop.
 * 4. When a frame is found, setup to resume user code.
 *    In particular, suspend the current fiber and resume the
 *    user fiber to execute the frame.
 *
 * Returns a fiber object that we should switch to after completing
 * the body of the loop, or NULL if we should continue executing on
 * this fiber.
 *
 * @pre @c current_fiber should equal @c wptr->l->scheduling_fiber
 *
 * @param current_fiber   The currently executing (scheduling) fiber
 * @param wptr            The currently executing worker.
 * @return                The next fiber we should switch to.
 */
static cilk_fiber* worker_scheduling_loop_body(cilk_fiber* current_fiber,
                                               void* wptr)
{
    __cilkrts_worker *w = (__cilkrts_worker*) wptr;
    CILK_ASSERT(current_fiber == w->l->scheduling_fiber);

    // Stage 1: Transition from executing user code to the runtime code.
    // We don't need to do this call here any more, because
    // every switch to the scheduling fiber should make this call
    // using a post_switch_proc on the fiber.
    //
    //  enter_runtime_transition_proc(w->l->scheduling_fiber, wptr);

    // After Stage 1 is complete, w should no longer have
    // an associated full frame.
    CILK_ASSERT(NULL == w->l->frame_ff);

    // Stage 2.  First do a quick check of our 1-element queue.
    full_frame *ff = pop_next_frame(w);

    if (!ff) {
        // Stage 3.  We didn't find anything from our 1-element
        // queue.  Now go through the steal loop to find work.
        ff = search_until_work_found_or_done(w);
        if (!ff) {
            CILK_ASSERT(w->g->work_done);
            return NULL;
        }
    }

    // Stage 4.  Now that we have found a full frame to work on,
    // actually execute it.
    __cilkrts_stack_frame *sf;

    // There shouldn't be any uncaught exceptions.
    //
    // In Windows, the OS catches any exceptions not caught by the
    // user code.  Thus, we are omitting the check on Windows.
    //
    // On Android, calling std::uncaught_exception with the stlport
    // library causes a seg fault.  Since we're not supporting
    // exceptions there at this point, just don't do the check
    CILKBUG_ASSERT_NO_UNCAUGHT_EXCEPTION();

    BEGIN_WITH_WORKER_LOCK(w) {
        CILK_ASSERT(!w->l->frame_ff);
        BEGIN_WITH_FRAME_LOCK(w, ff) {
            sf = ff->call_stack;
            CILK_ASSERT(sf && !sf->call_parent);
            setup_for_execution(w, ff, 0);
        } END_WITH_FRAME_LOCK(w, ff);
    } END_WITH_WORKER_LOCK(w);

    /* run it */
    //
    // Prepare to run the full frame.  To do so, we need to:
    //   (a) Execute some code on this fiber (the scheduling
    //       fiber) to set up data structures, and
    //   (b) Suspend the scheduling fiber, and resume the
    //       user-code fiber.

    // Part (a). Set up data structures.
    scheduling_fiber_prepare_to_resume_user_code(w, ff, sf);

    cilk_fiber *other = w->l->frame_ff->fiber_self;
    cilk_fiber_data* other_data = cilk_fiber_get_data(other);
    cilk_fiber_data* current_fiber_data = cilk_fiber_get_data(current_fiber);

    // I believe two cases are possible here, both of which
    // should have other_data->resume_sf as NULL.
    //
    // 1. Resuming a fiber that was previously executing
    //    user code (i.e., a provably-good-steal).
    //    In this case, resume_sf should have been
    //    set to NULL when it was suspended.
    //
    // 2. Resuming code on a steal.  In this case, since we
    //    grabbed a new fiber, resume_sf should be NULL.
    CILK_ASSERT(NULL == other_data->resume_sf);

#if FIBER_DEBUG >= 2
    fprintf(stderr, "W=%d: other fiber=%p, setting resume_sf to %p\n",
            w->self, other, other_data->resume_sf);
#endif
    // Update our own fiber's data.
    current_fiber_data->resume_sf = NULL;
    // The scheduling fiber should have the right owner from before.
    CILK_ASSERT(current_fiber_data->owner == w);
    other_data->resume_sf = sf;


#if FIBER_DEBUG >= 3
    fprintf(stderr, "ThreadId=%p (about to suspend self resume other), W=%d: current_fiber=%p, other=%p, current_fiber->resume_sf = %p, other->resume_sf = %p\n",
            cilkos_get_current_thread_id(),
            w->self,
            current_fiber, other,
            current_fiber_data->resume_sf,
            other_data->resume_sf);
#endif
    return other;
}


/**
 * This function is executed once by each worker, to initialize its
 * scheduling loop.
 */
static void worker_scheduler_init_function(__cilkrts_worker *w)
{
    // First, execute the startup tasks that must happen for all
    // worker types.
    ITT_SYNC_PREPARE(w);
    /* Notify tools about the new worker. Inspector needs this, but we
       don't want to confuse Cilkscreen with system threads.  User threads
       do this notification in bind_thread */
    if (! w->g->under_ptool)
        __cilkrts_cilkscreen_establish_worker(w);

    // Seed the initial random number generator.
    // If we forget to do this, then the worker always steals from 0.
    // Programs will still execute correctly, but
    // you may see a subtle performance bug...
    mysrand(w, (w->self + 1));

    // The startup work varies, depending on the worker type.
    switch (w->l->type) {
    case WORKER_USER:
        // Stop working once we've entered the scheduler.
        // For user workers, INTERVAL_IN_SCHEDULER counts the time
        // since we called bind_thread.
        break;

    case WORKER_SYSTEM:
        // If a system worker is starting, we must also be starting
        // the runtime.

        // Runtime begins in a wait-state and is woken up by the first user
        // worker when the runtime is ready.
        signal_node_wait(w->l->signal_node);
        // ...
        // Runtime is waking up.
        notify_children_run(w);
        w->l->steal_failure_count = 0;

        // For system threads, count all the time this thread is
        // alive in the scheduling loop.
        START_INTERVAL(w, INTERVAL_IN_SCHEDULER);
        START_INTERVAL(w, INTERVAL_WORKING);
        break;
    default:
        __cilkrts_bug("Unknown worker %p of type %d entering scheduling loop\n",
                      w, w->l->type);
    }
}

/**
 * This function is executed once by each worker, to finish its
 * scheduling loop.
 *
 * @note Currently, only system workers finish their loops.  User
 * workers will jump away to user code without exiting their
 * scheduling loop.
 */
static void worker_scheduler_terminate_function(__cilkrts_worker *w)
{
    // A user worker should never finish by falling through the
    // scheduling loop.
    CILK_ASSERT(WORKER_USER != w->l->type);
    STOP_INTERVAL(w, INTERVAL_IN_RUNTIME);
    STOP_INTERVAL(w, INTERVAL_IN_SCHEDULER);
}

/**
 * The main scheduler function executed by a worker's scheduling
 * fiber.
 *
 * This method is started by either a new system worker, or a user
 * worker that has stalled and just been imported into the runtime.
 */
static void worker_scheduler_function(__cilkrts_worker *w)
{
    worker_scheduler_init_function(w);

    // The main scheduling loop body.

    while (!w->g->work_done) {
        // Set intervals.  Now we are in the runtime instead of working.
        START_INTERVAL(w, INTERVAL_IN_RUNTIME);
        STOP_INTERVAL(w, INTERVAL_WORKING);

        // Execute the "body" of the scheduling loop, and figure
        // out the fiber to jump to next.
        cilk_fiber* fiber_to_resume
            = worker_scheduling_loop_body(w->l->scheduling_fiber, w);

        if (fiber_to_resume) {
            // Suspend the current fiber and resume next one.
            NOTE_INTERVAL(w, INTERVAL_SUSPEND_RESUME_OTHER);
            STOP_INTERVAL(w, INTERVAL_IN_RUNTIME);
            START_INTERVAL(w, INTERVAL_WORKING);
            cilk_fiber_suspend_self_and_resume_other(w->l->scheduling_fiber,
                                                     fiber_to_resume);

            // Return here only when this (scheduling) fiber is
            // resumed (i.e., this worker wants to reenter the runtime).
        }
    }

    // Finish the scheduling loop.
    worker_scheduler_terminate_function(w);
}


/*************************************************************
  Forward declarations for reduction protocol.
*************************************************************/

static __cilkrts_worker*
execute_reductions_for_sync(__cilkrts_worker *w,
                            full_frame *ff,
                            __cilkrts_stack_frame *sf_at_sync);

static __cilkrts_worker*
execute_reductions_for_spawn_return(__cilkrts_worker *w,
                                    full_frame *ff,
                                    __cilkrts_stack_frame *returning_sf);



/*************************************************************
  Scheduler functions that are callable by client code
*************************************************************/
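/* Detach full frame ff from worker w: mark it unrunnable, clear the
   worker's frame pointer, and return ff's parent frame. */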
static full_frame *disown(__cilkrts_worker *w,
                          full_frame *ff,
                          __cilkrts_stack_frame *sf,
                          const char *why)
{
    CILK_ASSERT(ff);
    make_unrunnable(w, ff, sf, sf != 0, why);
    w->l->frame_ff = 0;
    return ff->parent;
}

/**
 * Called when ff is returning from a spawn, and we need to execute a
 * reduction.
 *
 * @param w             The currently executing worker.
 * @param ff            The full frame for w.
 * @param returning_sf  The stack frame for the spawn helper that is returning.
 *
 * Normally, by the time we gain control in the runtime, the worker
 * has already popped off the __cilkrts_stack_frame "returning_sf"
 * from its call chain.
 *
 * When we have only serial reductions, w->current_stack_frame is not
 * needed any more, because w is about to enter the runtime scheduling
 * loop anyway.  Similarly, the frame "ff" is slated to be destroyed
 * after the runtime finishes the return from spawn and splices ff out
 * of the tree of full frames.
 *
 * To execute a parallel reduction, however, we still want
 * w->current_stack_frame == returning_sf, and we are going to use the
 * frame ff for a little bit longer.
 *
 * This method:
 *
 *   1. Puts returning_sf back as w's current stack frame.
 *   2. Makes "ff" runnable again on w.
 */
static inline
void restore_frame_for_spawn_return_reduction(__cilkrts_worker *w,
                                              full_frame *ff,
                                              __cilkrts_stack_frame *returning_sf) {
#if REDPAR_DEBUG >= 2
    CILK_ASSERT(returning_sf);
    CILK_ASSERT(returning_sf->worker == w);
#endif
    // Change w's current stack frame back to "returning_sf".
    //
    // Intuitively, w->current_stack_frame should be
    // returning_sf->call_parent at this point.
    //
    // We cannot assert this, however, because the pop of
    // returning_sf from the call chain has already cleared
    // returning_sf->call_parent.  We don't want to restore the call
    // parent of returning_sf, because its parent has been stolen, and
    // the runtime assumes that steals break this link.

    // We cannot assert call_parent is NULL either, since that's not true for
    // Win64 exception handling
//    CILK_ASSERT(returning_sf->call_parent == NULL);
    w->current_stack_frame = returning_sf;

    // Make the full frame "ff" runnable again, in preparation for
    // executing the reduction.
    make_runnable(w, ff);
}


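/* Runtime entry point for a stalled cilk_sync: move any pending
   exception into the full frame, run the reductions required at the
   sync, and then longjmp into the runtime to execute do_sync() on the
   scheduling fiber.  Does not return to the caller. */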
NORETURN __cilkrts_c_sync(__cilkrts_worker *w,
                          __cilkrts_stack_frame *sf_at_sync)
{
    full_frame *ff;

    // Claim: This read of w->l->frame_ff can occur without
    // holding the worker lock because when w has reached a sync
    // and entered the runtime (because it stalls), w's deque is empty
    // and no one else can steal and change w->l->frame_ff.

    ff = w->l->frame_ff;
#ifdef _WIN32
    __cilkrts_save_exception_state(w, ff);
#else
    // Move any pending exceptions into the full frame
    CILK_ASSERT(NULL == ff->pending_exception);
    ff->pending_exception = w->l->pending_exception;
    w->l->pending_exception = NULL;
#endif

    w = execute_reductions_for_sync(w, ff, sf_at_sync);

#if FIBER_DEBUG >= 3
    fprintf(stderr, "ThreadId=%p, w->self = %d. about to longjmp_into_runtime[c_sync] with ff=%p\n",
            cilkos_get_current_thread_id(), w->self, ff);
#endif

    longjmp_into_runtime(w, do_sync, sf_at_sync);
}

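/* Scheduling-stack function run for a sync.  With the worker and frame
   locks held, it records the sync pedigree for ABI 1 (or later) frames
   and attempts a provably good steal on ff.  Under record/replay it may
   sleep and retry until the steal no longer reports WAIT_FOR_CONTINUE. */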
2212static void do_sync(__cilkrts_worker *w, full_frame *ff,
2213                    __cilkrts_stack_frame *sf)
2214{
2215    //int abandoned = 1;
2216    enum provably_good_steal_t steal_result = ABANDON_EXECUTION;
2217
2218    START_INTERVAL(w, INTERVAL_SYNC_CHECK) {
2219        BEGIN_WITH_WORKER_LOCK_OPTIONAL(w) {
2220
2221            CILK_ASSERT(ff);
2222            BEGIN_WITH_FRAME_LOCK(w, ff) {
2223                CILK_ASSERT(sf->call_parent == 0);
2224                CILK_ASSERT(sf->flags & CILK_FRAME_UNSYNCHED);
2225
2226                // Before switching into the scheduling fiber, we should have
2227                // already taken care of deallocating the current
2228                // fiber.
2229                CILK_ASSERT(NULL == ff->fiber_self);
2230
2231                // Update the frame's pedigree information if this is an ABI 1
2232                // or later frame
2233                if (CILK_FRAME_VERSION_VALUE(sf->flags) >= 1)
2234                {
2235                    sf->parent_pedigree.rank = w->pedigree.rank;
2236                    sf->parent_pedigree.parent = w->pedigree.parent;
2237
2238                    // Note that the pedigree rank needs to be updated
2239                    // when setup_for_execution_pedigree runs
2240                    sf->flags |= CILK_FRAME_SF_PEDIGREE_UNSYNCHED;
2241                }
2242
2243                /* the decjoin() occurs in provably_good_steal() */
2244                steal_result = provably_good_steal(w, ff);
2245
2246            } END_WITH_FRAME_LOCK(w, ff);
            // Set w->l->frame_ff = NULL after checking the steal result.
2248            if (WAIT_FOR_CONTINUE != steal_result) {
2249                w->l->frame_ff = NULL;
2250            }
2251        } END_WITH_WORKER_LOCK_OPTIONAL(w);
2252    } STOP_INTERVAL(w, INTERVAL_SYNC_CHECK);
2253
2254    // Now, if we are in a replay situation and provably_good_steal() returned
2255    // WAIT_FOR_CONTINUE, we should sleep, reacquire locks, call
2256    // provably_good_steal(), and release locks until we get a value other
2257    // than WAIT_FOR_CONTINUE from the function.
2258#ifdef CILK_RECORD_REPLAY
2259    // We don't have to explicitly check for REPLAY_LOG below because
2260    // steal_result can only be set to WAIT_FOR_CONTINUE during replay
2261    while(WAIT_FOR_CONTINUE == steal_result)
2262    {
2263        __cilkrts_sleep();
2264        BEGIN_WITH_WORKER_LOCK_OPTIONAL(w)
2265        {
2266            ff = w->l->frame_ff;
2267            BEGIN_WITH_FRAME_LOCK(w, ff)
2268            {
2269                steal_result = provably_good_steal(w, ff);
2270            } END_WITH_FRAME_LOCK(w, ff);
2271            if (WAIT_FOR_CONTINUE != steal_result)
2272                w->l->frame_ff = NULL;
2273        } END_WITH_WORKER_LOCK_OPTIONAL(w);
2274    }
2275#endif  // CILK_RECORD_REPLAY
2276
2277#ifdef ENABLE_NOTIFY_ZC_INTRINSIC
2278    // If we can't make any further progress on this thread, tell Inspector
2279    // that we're abandoning the work and will go find something else to do.
2280    if (ABANDON_EXECUTION == steal_result)
2281    {
2282        NOTIFY_ZC_INTRINSIC("cilk_sync_abandon", 0);
2283    }
2284#endif // defined ENABLE_NOTIFY_ZC_INTRINSIC
2285
2286    return; /* back to scheduler loop */
2287}
2288
2289/* worker W completely promotes its own deque, simulating the case
2290   where the whole deque is stolen.  We use this mechanism to force
2291   the allocation of new storage for reducers for race-detection
2292   purposes. */
2293void __cilkrts_promote_own_deque(__cilkrts_worker *w)
2294{
2295    // Remember the fiber we start this method on.
2296    CILK_ASSERT(w->l->frame_ff);
2297    cilk_fiber* starting_fiber = w->l->frame_ff->fiber_self;
2298
2299    BEGIN_WITH_WORKER_LOCK(w) {
2300        while (dekker_protocol(w)) {
2301            /* PLACEHOLDER_FIBER is used as non-null marker to tell detach()
2302               and make_child() that this frame should be treated as a spawn
2303               parent, even though we have not assigned it a stack. */
2304            detach_for_steal(w, w, PLACEHOLDER_FIBER);
2305        }
2306    } END_WITH_WORKER_LOCK(w);
2307
2308
2309    // TBD: The management of full frames and fibers is a bit
2310    // sketchy here.  We are promoting stack frames into full frames,
2311    // and pretending they are stolen away, but no other worker is
2312    // actually working on them.  Some runtime invariants
2313    // may be broken here.
2314    //
    // Technically, if we are simulating a steal from w,
    // then w should get a new full frame but
    // keep the same fiber.  A real thief would take the
    // loot frame away, get a new fiber, and start executing the
    // loot frame.
2320    //
2321    // What should a fake thief do?  Where does the frame go?
2322
    // In any case, we should finish the promotion process on
    // the same fiber we started with.
2325    CILK_ASSERT(w->l->frame_ff);
2326    CILK_ASSERT(w->l->frame_ff->fiber_self == starting_fiber);
2327}
2328
2329
2330
/* The client code calls this function after a spawn when the Dekker
   protocol fails.  The function may either return or longjmp
   into the runtime.
2334
2335   This function takes in a "returning_sf" argument which corresponds
2336   to the __cilkrts_stack_frame that we are finishing (i.e., the
2337   argument to __cilkrts_leave_frame).
2338   */
2339void __cilkrts_c_THE_exception_check(__cilkrts_worker *w,
2340                                     __cilkrts_stack_frame *returning_sf)
2341{
2342    full_frame *ff;
2343    int stolen_p;
2344    __cilkrts_stack_frame *saved_sf = NULL;
2345
2346    START_INTERVAL(w, INTERVAL_THE_EXCEPTION_CHECK);
2347
2348    BEGIN_WITH_WORKER_LOCK(w) {
2349        ff = w->l->frame_ff;
2350        CILK_ASSERT(ff);
2351        /* This code is called only upon a normal return and never
2352           upon an exceptional return.  Assert that this is the
2353           case. */
2354        CILK_ASSERT(!w->l->pending_exception);
2355
2356        reset_THE_exception(w);
2357        stolen_p = !(w->head < (w->tail + 1)); /* +1 because tail was
2358                                                  speculatively
2359                                                  decremented by the
2360                                                  compiled code */
2361
2362        if (stolen_p) {
2363            /* XXX This will be charged to THE for accounting purposes */
2364            __cilkrts_save_exception_state(w, ff);
2365
2366            // Save the value of the current stack frame.
2367            saved_sf = w->current_stack_frame;
2368
2369            // Reverse the decrement from undo_detach.
2370            // This update effectively resets the deque to be
2371            // empty (i.e., changes w->tail back to equal w->head).
2372            // We need to reset the deque to execute parallel
2373            // reductions.  When we have only serial reductions, it
2374            // does not matter, since serial reductions do not
2375            // change the deque.
2376            w->tail++;
2377#if REDPAR_DEBUG > 1
2378            // ASSERT our deque is empty.
2379            CILK_ASSERT(w->head == w->tail);
2380#endif
2381        }
2382    } END_WITH_WORKER_LOCK(w);
2383
2384    STOP_INTERVAL(w, INTERVAL_THE_EXCEPTION_CHECK);
2385
2386    if (stolen_p)
2387    {
2388        w = execute_reductions_for_spawn_return(w, ff, returning_sf);
2389
2390        // "Mr. Policeman?  My parent always told me that if I was in trouble
2391        // I should ask a nice policeman for help.  I can't find my parent
2392        // anywhere..."
2393        //
2394        // Write a record to the replay log for an attempt to return to a stolen parent
2395        replay_record_orphaned(w);
2396
2397        // Update the pedigree only after we've finished the
2398        // reductions.
2399        update_pedigree_on_leave_frame(w, returning_sf);
2400
2401        // Notify Inspector that the parent has been stolen and we're
2402        // going to abandon this work and go do something else.  This
2403        // will match the cilk_leave_begin in the compiled code
2404        NOTIFY_ZC_INTRINSIC("cilk_leave_stolen", saved_sf);
2405
2406        DBGPRINTF ("%d: longjmp_into_runtime from __cilkrts_c_THE_exception_check\n", w->self);
2407        longjmp_into_runtime(w, do_return_from_spawn, 0);
2408        DBGPRINTF ("%d: returned from longjmp_into_runtime from __cilkrts_c_THE_exception_check?!\n", w->self);
2409    }
2410    else
2411    {
2412        NOTE_INTERVAL(w, INTERVAL_THE_EXCEPTION_CHECK_USELESS);
2413        return;
2414    }
2415}
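
/*
 * Illustrative sketch (not part of the runtime): the function above is
 * reached from the epilogue the compiler emits for a spawning function.
 * Conceptually, that epilogue does something like the following; the exact
 * conflict test lives in the compiled code and in __cilkrts_leave_frame,
 * so this only summarizes the THE protocol implied by the
 * "+1 because tail was speculatively decremented" comment above.
 *
 *     --w->tail;                          // speculative pop of the deque
 *     if ( the THE check fails )          // a thief may have taken the parent
 *         __cilkrts_c_THE_exception_check(w, sf);  // may longjmp into the runtime
 */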
2416
2417/* Return an exception to a stolen parent. */
2418NORETURN __cilkrts_exception_from_spawn(__cilkrts_worker *w,
2419                                        __cilkrts_stack_frame *returning_sf)
2420{
2421    full_frame *ff = w->l->frame_ff;
    // This is almost the same as THE_exception_check, except that
    // the detach didn't happen, so we don't need to undo the tail
    // update.
2425    CILK_ASSERT(w->head == w->tail);
2426    w = execute_reductions_for_spawn_return(w, ff, returning_sf);
2427
2428    longjmp_into_runtime(w, do_return_from_spawn, 0);
2429    CILK_ASSERT(0);
2430}
2431
2432static void do_return_from_spawn(__cilkrts_worker *w,
2433                                 full_frame *ff,
2434                                 __cilkrts_stack_frame *sf)
2435{
2436    full_frame *parent_ff;
2437    enum provably_good_steal_t steal_result = ABANDON_EXECUTION;
2438
2439    BEGIN_WITH_WORKER_LOCK_OPTIONAL(w) {
2440        CILK_ASSERT(ff);
2441        CILK_ASSERT(!ff->is_call_child);
2442        CILK_ASSERT(sf == NULL);
2443        parent_ff = ff->parent;
2444
2445        BEGIN_WITH_FRAME_LOCK(w, ff) {
2446            decjoin(ff);
2447        } END_WITH_FRAME_LOCK(w, ff);
2448
2449        BEGIN_WITH_FRAME_LOCK(w, parent_ff) {
2450            if (parent_ff->simulated_stolen)
2451                unconditional_steal(w, parent_ff);
2452            else
2453                steal_result = provably_good_steal(w, parent_ff);
2454        } END_WITH_FRAME_LOCK(w, parent_ff);
2455
2456    } END_WITH_WORKER_LOCK_OPTIONAL(w);
2457
2458    // Loop here in replay mode
2459#ifdef CILK_RECORD_REPLAY
2460    // We don't have to explicitly check for REPLAY_LOG below because
2461    // steal_result can only get set to WAIT_FOR_CONTINUE during replay.
2462    // We also don't have to worry about the simulated_stolen flag
2463    // because steal_result can only be set to WAIT_FOR_CONTINUE by
2464    // provably_good_steal().
2465    while(WAIT_FOR_CONTINUE == steal_result)
2466    {
2467        __cilkrts_sleep();
2468        BEGIN_WITH_WORKER_LOCK_OPTIONAL(w)
2469        {
2470            BEGIN_WITH_FRAME_LOCK(w, parent_ff)
2471            {
2472                steal_result = provably_good_steal(w, parent_ff);
2473            } END_WITH_FRAME_LOCK(w, parent_ff);
2474        } END_WITH_WORKER_LOCK_OPTIONAL(w);
2475    }
2476#endif  // CILK_RECORD_REPLAY
2477
2478    // Cleanup the child frame.
2479    __cilkrts_destroy_full_frame(w, ff);
2480    return;
2481}
2482
2483#ifdef _WIN32
2484/* migrate an exception across fibers.  Call this function when an exception has
2485 * been thrown and has to traverse across a steal.  The exception has already
2486 * been wrapped up, so all that remains is to longjmp() into the continuation,
2487 * sync, and re-raise it.
2488 */
2489void __cilkrts_migrate_exception(__cilkrts_stack_frame *sf) {
2490
2491    __cilkrts_worker *w = sf->worker;
2492    full_frame *ff;
2493
2494    BEGIN_WITH_WORKER_LOCK(w) {
2495        ff = w->l->frame_ff;
2496        reset_THE_exception(w);
2497        /* there is no need to check for a steal because we wouldn't be here if
2498           there weren't a steal. */
2499        __cilkrts_save_exception_state(w, ff);
2500
2501        CILK_ASSERT(w->head == w->tail);
2502    } END_WITH_WORKER_LOCK(w);
2503
2504    {
        // TBD(jsukha): This function emulates the
        // "do_return_from_spawn" path.
2507        w = execute_reductions_for_spawn_return(w, ff, sf);
2508    }
2509
2510    longjmp_into_runtime(w, do_return_from_spawn, 0); /* does not return. */
2511    CILK_ASSERT(! "Shouldn't be here...");
2512}
2513#endif
2514
2515
2516/* Pop a call stack from TAIL.  Return the call stack, or NULL if the
2517   queue is empty */
2518__cilkrts_stack_frame *__cilkrts_pop_tail(__cilkrts_worker *w)
2519{
2520    __cilkrts_stack_frame *sf;
2521    BEGIN_WITH_WORKER_LOCK(w) {
2522        __cilkrts_stack_frame *volatile *tail = w->tail;
2523        if (w->head < tail) {
2524            --tail;
2525            sf = *tail;
2526            w->tail = tail;
2527        } else {
2528            sf = 0;
2529        }
2530    } END_WITH_WORKER_LOCK(w);
2531    return sf;
2532}
2533
2534#ifdef CILK_RECORD_REPLAY
2535__cilkrts_stack_frame *simulate_pop_tail(__cilkrts_worker *w)
2536{
2537    __cilkrts_stack_frame *sf;
2538    BEGIN_WITH_WORKER_LOCK(w) {
2539        if (w->head < w->tail) {
2540            sf = *(w->tail-1);
2541        } else {
2542            sf = 0;
2543        }
2544    } END_WITH_WORKER_LOCK(w);
2545    return sf;
2546}
2547#endif
2548
2549
2550/* Return from a call, not a spawn. */
2551void __cilkrts_return(__cilkrts_worker *w)
2552{
2553    full_frame *ff, *parent_ff;
2554    START_INTERVAL(w, INTERVAL_RETURNING);
2555
2556    BEGIN_WITH_WORKER_LOCK_OPTIONAL(w) {
2557        ff = w->l->frame_ff;
2558        CILK_ASSERT(ff);
2559        CILK_ASSERT(ff->join_counter == 1);
2560        /* This path is not used to return from spawn. */
2561        CILK_ASSERT(ff->is_call_child);
2562
2563        BEGIN_WITH_FRAME_LOCK(w, ff) {
            // After this call, w->l->frame_ff != ff.
            // Technically, however, w will still "own" ff until ff is freed,
            // because ff is a dying leaf full frame.
2567            parent_ff = disown(w, ff, 0, "return");
2568            decjoin(ff);
2569
2570#ifdef _WIN32
2571            __cilkrts_save_exception_state(w, ff);
2572#else
2573            // Move the pending exceptions into the full frame
2574            // This should always be NULL if this isn't a
2575            // return with an exception
2576            CILK_ASSERT(NULL == ff->pending_exception);
2577            ff->pending_exception = w->l->pending_exception;
2578            w->l->pending_exception = NULL;
2579#endif  // _WIN32
2580
2581        } END_WITH_FRAME_LOCK(w, ff);
2582
2583        __cilkrts_fence(); /* redundant */
2584
2585        CILK_ASSERT(parent_ff);
2586
2587        BEGIN_WITH_FRAME_LOCK(w, parent_ff) {
2588            finalize_child_for_call(w, parent_ff, ff);
2589        } END_WITH_FRAME_LOCK(w, parent_ff);
2590
2591        ff = pop_next_frame(w);
2592        /* ff will be non-null except when the parent frame is owned
2593           by another worker.
2594           CILK_ASSERT(ff)
2595        */
2596        CILK_ASSERT(!w->l->frame_ff);
2597        if (ff) {
2598            BEGIN_WITH_FRAME_LOCK(w, ff) {
2599                __cilkrts_stack_frame *sf = ff->call_stack;
2600                CILK_ASSERT(sf && !sf->call_parent);
2601                setup_for_execution(w, ff, 1);
2602            } END_WITH_FRAME_LOCK(w, ff);
2603        }
2604    } END_WITH_WORKER_LOCK_OPTIONAL(w);
2605
2606    STOP_INTERVAL(w, INTERVAL_RETURNING);
2607}
2608
static void __cilkrts_unbind_thread(void)
2610{
2611    int stop_cilkscreen = 0;
2612    global_state_t *g;
2613
2614    // Take out the global OS mutex to protect accesses to the table of workers
2615    global_os_mutex_lock();
2616
2617    if (cilkg_is_published()) {
2618        __cilkrts_worker *w = __cilkrts_get_tls_worker();
2619        if (w) {
2620            g = w->g;
2621
2622            // If there's only 1 worker, the counts will be stopped in
2623            // __cilkrts_scheduler
2624            if (g->P > 1)
2625            {
2626                STOP_INTERVAL(w, INTERVAL_WORKING);
2627                STOP_INTERVAL(w, INTERVAL_IN_SCHEDULER);
2628            }
2629
2630            __cilkrts_set_tls_worker(0);
2631
2632            if (w->self == -1) {
2633                // This worker is an overflow worker.  I.e., it was created on-
2634                // demand when the global pool ran out of workers.
2635                destroy_worker(w);
2636                __cilkrts_free(w);
2637            } else {
2638                // This is a normal user worker and needs to be counted by the
2639                // global state for the purposes of throttling system workers.
2640                w->l->type = WORKER_FREE;
2641                __cilkrts_leave_cilk(g);
2642            }
2643
2644            stop_cilkscreen = (0 == g->Q);
2645        }
2646    }
2647    global_os_mutex_unlock();
2648
2649    /* Turn off Cilkscreen.  This needs to be done when we are NOT holding the
2650     * os mutex. */
2651    if (stop_cilkscreen)
2652        __cilkrts_cilkscreen_disable_instrumentation();
2653}
2654
2655/* special return from the initial frame */
2656
2657void __cilkrts_c_return_from_initial(__cilkrts_worker *w)
2658{
2659    struct cilkred_map *rm;
2660
2661    /* This is only called on a user thread worker. */
2662    CILK_ASSERT(w->l->type == WORKER_USER);
2663
2664    #if REDPAR_DEBUG >= 3
2665    fprintf(stderr, "[W=%d, desc=cilkrts_c_return_from_initial, ff=%p]\n",
2666            w->self, w->l->frame_ff);
2667    #endif
2668
2669    BEGIN_WITH_WORKER_LOCK_OPTIONAL(w) {
2670        full_frame *ff = w->l->frame_ff;
2671        CILK_ASSERT(ff);
2672        CILK_ASSERT(ff->join_counter == 1);
2673        w->l->frame_ff = 0;
2674
2675        CILK_ASSERT(ff->fiber_self);
2676        // Save any TBB interop data for the next time this thread enters Cilk
2677        cilk_fiber_tbb_interop_save_info_from_stack(ff->fiber_self);
2678
2679        // Deallocate cilk_fiber that mapped to the user stack.  The stack
2680        // itself does not get deallocated (of course) but our data
2681        // structure becomes divorced from it.
2682
2683#if FIBER_DEBUG >= 1
2684        fprintf(stderr, "ThreadId=%p: w=%d: We are about to deallocate ff->fiber_self  = %p here. w->l->scheduling_fiber = %p. w->l->type = %d\n",
2685                cilkos_get_current_thread_id(),
2686                w->self,
2687                ff->fiber_self,
2688                w->l->scheduling_fiber,
2689                w->l->type);
2690#endif
2691        // The fiber in ff is a user-code fiber.  The fiber in
2692        // w->l->scheduling_fiber is a scheduling fiber.  These fibers should
2693        // never be equal.  When a user worker returns (and will unbind), we
2694        // should destroy only the fiber in ff.  The scheduling fiber will be
2695        // re-used.
2696
2697        CILK_ASSERT(ff->fiber_self != w->l->scheduling_fiber);
2698
2699        START_INTERVAL(w, INTERVAL_FIBER_DEALLOCATE) {
2700            // This fiber might not be deallocated here if there
2701            // is a pending exception on Windows that refers
2702            // to this fiber.
2703            //
2704            // First "suspend" the fiber, and then try to delete it.
2705            cilk_fiber_deallocate_from_thread(ff->fiber_self);
2706        } STOP_INTERVAL(w, INTERVAL_FIBER_DEALLOCATE);
2707        ff->fiber_self = NULL;
2708
2709        /* Save reducer map into global_state object */
2710        rm = w->reducer_map;
2711        w->reducer_map = NULL;
2712
2713#if REDPAR_DEBUG >= 3
2714        fprintf(stderr, "W=%d, reducer_map_to_delete=%p, was in ff=%p\n",
2715                w->self,
2716                rm,
2717                ff);
2718#endif
2719        __cilkrts_destroy_full_frame(w, ff);
2720
2721
2722        /* Work is never done. w->g->work_done = 1; __cilkrts_fence(); */
2723    } END_WITH_WORKER_LOCK_OPTIONAL(w);
2724
2725
2726    save_pedigree_leaf_from_user_worker(w);
2727
2728    // Workers can have NULL reducer maps now.
2729    if (rm) {
2730        __cilkrts_destroy_reducer_map(w, rm);
2731    }
2732
2733
2734#if FIBER_DEBUG >= 1
2735    __cilkrts_worker* tmp = w;
2736    int tmp_id = w->self;
    fprintf(stderr, "w=%d: We are about to unbind thread (w= %p)\n",
2738            w->self,
2739            w);
2740#endif
2741
2742    w = NULL;
2743
2744    __cilkrts_unbind_thread();
2745
2746#if FIBER_DEBUG >= 1
2747
2748    fprintf(stderr, "w=%p, %d: Finished unbind\n",
2749            tmp, tmp_id);
2750#endif
2751
2752    /* Other workers will stop trying to steal if this was the last worker. */
2753
2754    return;
2755}
2756
2757
2758/*
2759 * __cilkrts_restore_stealing
2760 *
2761 * Restore the protected_tail to a previous state, possibly allowing frames
2762 * to be stolen.  The dekker_protocol has been extended to steal only if
2763 * head+1 is < protected_tail.
2764 */
2765
2766void __cilkrts_restore_stealing(
2767    __cilkrts_worker *w,
2768    __cilkrts_stack_frame *volatile *saved_protected_tail)
2769{
2770    /* On most x86 this pair of operations would be slightly faster
2771       as an atomic exchange due to the implicit memory barrier in
2772       an atomic instruction. */
2773    w->protected_tail = saved_protected_tail;
2774    __cilkrts_fence();
2775}
2776
2777/*
2778 * __cilkrts_disallow_stealing
2779 *
2780 * Move the protected_tail to NEW_PROTECTED_TAIL, preventing any
2781 * frames from being stolen.  If NEW_PROTECTED_TAIL is NULL, prevent
2782 * stealing from the whole queue.  The dekker_protocol has been
2783 * extended to only steal if head+1 is also < protected_tail.
2784 */
2785
2786__cilkrts_stack_frame *volatile *__cilkrts_disallow_stealing(
2787    __cilkrts_worker *w,
2788    __cilkrts_stack_frame *volatile *new_protected_tail)
2789{
2790    __cilkrts_stack_frame *volatile *saved_protected_tail = w->protected_tail;
2791
2792    if (!new_protected_tail)
2793        new_protected_tail = w->l->ltq;
2794
2795    if (w->protected_tail > new_protected_tail) {
2796        w->protected_tail = new_protected_tail;
2797        /* Issue a store-store barrier.  The update to protected_tail
2798           here must precede the update to tail in the next spawn.
2799           On x86 this is probably not needed. */
#if defined __GNUC__ && __ICC >= 1200 && !(__MIC__ || __MIC2__)
2801        _mm_sfence();
2802#else
2803        __cilkrts_fence();
2804#endif
2805    }
2806
2807    return saved_protected_tail;
2808}
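
/*
 * Usage sketch (illustrative only): callers are expected to bracket a
 * region in which the deque must not be stolen from by pairing the two
 * functions above, saving and later restoring the protected tail:
 *
 *     __cilkrts_stack_frame *volatile *saved =
 *         __cilkrts_disallow_stealing(w, NULL);   // NULL protects the whole deque
 *     ...                                         // stealing from w is now blocked
 *     __cilkrts_restore_stealing(w, saved);       // allow stealing again
 *
 * Passing a non-NULL new_protected_tail protects only part of the deque,
 * per the extended dekker_protocol described in the comments above.
 */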
2809
2810/*************************************************************
2811  Initialization and startup
2812*************************************************************/
2813
2814__cilkrts_worker *make_worker(global_state_t *g,
2815                              int self, __cilkrts_worker *w)
2816{
2817    w->self = self;
2818    w->g = g;
2819
2820    w->pedigree.rank = 0;    // Initial rank is 0
2821    w->pedigree.parent = NULL;
2822
2823    w->l = (local_state *)__cilkrts_malloc(sizeof(*w->l));
2824
2825    __cilkrts_frame_malloc_per_worker_init(w);
2826
2827    w->reducer_map = NULL;
2828    w->current_stack_frame = NULL;
2829    w->reserved = NULL;
2830
2831    w->l->worker_magic_0 = WORKER_MAGIC_0;
2832    w->l->team = NULL;
2833    w->l->type = WORKER_FREE;
2834
2835    __cilkrts_mutex_init(&w->l->lock);
2836    __cilkrts_mutex_init(&w->l->steal_lock);
2837    w->l->do_not_steal = 0;
2838    w->l->frame_ff = 0;
2839    w->l->next_frame_ff = 0;
2840    w->l->last_full_frame = NULL;
2841
2842    w->l->ltq = (__cilkrts_stack_frame **)
2843        __cilkrts_malloc(g->ltqsize * sizeof(*w->l->ltq));
2844    w->ltq_limit = w->l->ltq + g->ltqsize;
2845    w->head = w->tail = w->l->ltq;
2846
2847    cilk_fiber_pool_init(&w->l->fiber_pool,
2848                         &g->fiber_pool,
2849                         g->stack_size,
2850                         g->fiber_pool_size,
2851                         0,   // alloc_max is 0.  We don't allocate from the heap directly without checking the parent pool.
2852                         0);
2853#if FIBER_DEBUG >= 2
2854    fprintf(stderr, "ThreadId=%p: Making w=%d (%p), pool = %p\n",
2855            cilkos_get_current_thread_id(),
2856            w->self, w,
2857            &w->l->fiber_pool);
2858#endif
2859    w->l->scheduling_fiber = NULL;
2860    w->l->original_pedigree_leaf = NULL;
2861    w->l->rand_seed = 0; /* the scheduler will overwrite this field */
2862
2863    w->l->post_suspend = 0;
2864    w->l->suspended_stack = 0;
2865    w->l->fiber_to_free = NULL;
2866    w->l->pending_exception = NULL;
2867
2868#if CILK_PROFILE
2869    w->l->stats = __cilkrts_malloc(sizeof(statistics));
2870    __cilkrts_init_stats(w->l->stats);
2871#else
2872    w->l->stats = NULL;
2873#endif
2874    w->l->steal_failure_count = 0;
2875
    w->l->work_stolen = 0;      // Nothing's been stolen yet
2877
2878    // Initialize record/replay assuming we're doing neither
2879    w->l->record_replay_fptr = NULL;
2880    w->l->replay_list_root = NULL;
2881    w->l->replay_list_entry = NULL;
2882    w->l->signal_node = NULL;
    w->l->worker_magic_1 = WORKER_MAGIC_1;
2885
2886    /*w->parallelism_disabled = 0;*/
2887
    // Allow stealing of all frames.  Sets w->protected_tail.
2889    __cilkrts_restore_stealing(w, w->ltq_limit);
2890
2891    __cilkrts_init_worker_sysdep(w);
2892
2893    reset_THE_exception(w);
2894
2895    return w;
2896}
2897
2898void destroy_worker(__cilkrts_worker *w)
2899{
2900    CILK_ASSERT (NULL == w->l->pending_exception);
2901
2902    // Deallocate the scheduling fiber
2903    if (NULL != w->l->scheduling_fiber)
2904    {
2905        // The scheduling fiber is the main fiber for system workers and must
2906        // be deallocated by the thread that created it.  Thus, we can
2907        // deallocate only free workers' (formerly user workers) scheduling
2908        // fibers here.
2909        CILK_ASSERT(WORKER_FREE == w->l->type);
2910
2911#if FIBER_DEBUG >=1
2912        fprintf(stderr, "ThreadId=%p, w=%p, %d, deallocating scheduling fiber = %p, \n",
2913                cilkos_get_current_thread_id(),
2914                w,
2915                w->self,
2916                w->l->scheduling_fiber);
2917#endif
2918        int ref_count = cilk_fiber_remove_reference(w->l->scheduling_fiber, NULL);
2919        // Scheduling fiber should never have extra references because of exceptions.
2920        CILK_ASSERT(0 == ref_count);
2921        w->l->scheduling_fiber = NULL;
2922    }
2923
2924#if CILK_PROFILE
2925    if (w->l->stats) {
2926        __cilkrts_free(w->l->stats);
2927    }
2928#else
2929    CILK_ASSERT(NULL == w->l->stats);
2930#endif
2931
2932    /* Free any cached fibers. */
2933    cilk_fiber_pool_destroy(&w->l->fiber_pool);
2934
2935    __cilkrts_destroy_worker_sysdep(w);
2936
2937    if (w->l->signal_node) {
2938        CILK_ASSERT(WORKER_SYSTEM == w->l->type);
2939        signal_node_destroy(w->l->signal_node);
2940    }
2941
2942    __cilkrts_free(w->l->ltq);
2943    __cilkrts_mutex_destroy(0, &w->l->lock);
2944    __cilkrts_mutex_destroy(0, &w->l->steal_lock);
2945    __cilkrts_frame_malloc_per_worker_cleanup(w);
2946
2947    __cilkrts_free(w->l);
2948
2949    // The caller is responsible for freeing the worker memory
2950}
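
/*
 * Lifecycle sketch (illustrative only): make_worker() initializes a worker
 * in caller-provided storage, and destroy_worker() releases only what
 * make_worker() allocated internally (w->l, the ltq, cached fibers, ...).
 * The pattern used by init_workers() and __cilkrts_deinit_internal() below
 * is roughly:
 *
 *     w = make_worker(g, i, &workers_memory[i].w);  // caller owns the struct memory
 *     ...
 *     destroy_worker(w);        // tears down internal state only
 *     // the contiguous workers_memory block is freed separately by the caller
 */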
2951
2952/*
2953 * Make a worker into a system worker.
2954 */
2955static void make_worker_system(__cilkrts_worker *w) {
2956    CILK_ASSERT(WORKER_FREE == w->l->type);
2957    w->l->type = WORKER_SYSTEM;
2958    w->l->signal_node = signal_node_create();
2959}
2960
2961void __cilkrts_deinit_internal(global_state_t *g)
2962{
2963    int i;
2964    __cilkrts_worker *w;
2965
2966    // If there's no global state then we're done
2967    if (NULL == g)
2968        return;
2969
2970#ifdef CILK_PROFILE
2971    __cilkrts_dump_stats_to_stderr(g);
2972#endif
2973
2974    w = g->workers[0];
2975    if (w->l->frame_ff) {
2976        __cilkrts_destroy_full_frame(w, w->l->frame_ff);
2977        w->l->frame_ff = 0;
2978    }
2979
2980    // Release any resources used for record/replay
2981    replay_term(g);
2982
2983    // Destroy any system dependent global state
2984    __cilkrts_destroy_global_sysdep(g);
2985
2986    for (i = 0; i < g->total_workers; ++i)
2987        destroy_worker(g->workers[i]);
2988
2989    // Free memory for all worker blocks which were allocated contiguously
2990    __cilkrts_free(g->workers[0]);
2991
2992    __cilkrts_free(g->workers);
2993
2994    cilk_fiber_pool_destroy(&g->fiber_pool);
2995    __cilkrts_frame_malloc_global_cleanup(g);
2996
2997    cilkg_deinit_global_state();
2998}
2999
3000/*
3001 * Wake the runtime by notifying the system workers that they can steal.  The
3002 * first user worker into the runtime should call this.
3003 */
3004static void wake_runtime(global_state_t *g)
3005{
3006    __cilkrts_worker *root;
3007    if (g->P > 1) {
3008        // Send a message to the root node.  The message will propagate.
3009        root = g->workers[0];
3010        CILK_ASSERT(root->l->signal_node);
3011        signal_node_msg(root->l->signal_node, 1);
3012    }
3013}
3014
3015/*
3016 * Put the runtime to sleep.  The last user worker out of the runtime should
3017 * call this.  Like Dad always said, turn out the lights when nobody's in the
3018 * room.
3019 */
3020static void sleep_runtime(global_state_t *g)
3021{
3022    __cilkrts_worker *root;
3023    if (g->P > 1) {
3024        // Send a message to the root node.  The message will propagate.
3025        root = g->workers[0];
3026        CILK_ASSERT(root->l->signal_node);
3027        signal_node_msg(root->l->signal_node, 0);
3028    }
3029}
3030
3031/* Called when a user thread joins Cilk.
3032   Global lock must be held. */
3033void __cilkrts_enter_cilk(global_state_t *g)
3034{
3035    if (g->Q++ == 0) {
3036        // If this is the first user thread to enter Cilk wake
3037        // up all the workers.
3038        wake_runtime(g);
3039    }
3040}
3041
3042/* Called when a user thread leaves Cilk.
3043   Global lock must be held. */
3044void __cilkrts_leave_cilk(global_state_t *g)
3045{
3046    if (--g->Q == 0) {
3047        // Put the runtime to sleep.
3048        sleep_runtime(g);
3049    }
3050}
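
/*
 * Usage sketch (illustrative only): the Q count above is protected by the
 * global OS mutex, so a user thread binding to or unbinding from Cilk is
 * expected to pair these calls while holding that lock, e.g.:
 *
 *     global_os_mutex_lock();
 *     __cilkrts_enter_cilk(g);    // first user thread in wakes the runtime
 *     global_os_mutex_unlock();
 *     ...
 *     global_os_mutex_lock();
 *     __cilkrts_leave_cilk(g);    // last user thread out puts it to sleep
 *     global_os_mutex_unlock();
 *
 * __cilkrts_unbind_thread() above shows the "leave" half of this pattern.
 */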
3051
3052/*
3053 * worker_runnable
3054 *
 * Return SCHEDULE_RUN if the worker should continue to try to steal,
 * SCHEDULE_WAIT if it should stop and wait to be woken, or SCHEDULE_EXIT
 * if the runtime is shutting down.
3056 */
3057
3058NOINLINE
3059static enum schedule_t worker_runnable(__cilkrts_worker *w)
3060{
3061    global_state_t *g = w->g;
3062
3063    /* If this worker has something to do, do it.
3064       Otherwise the work would be lost. */
3065    if (w->l->next_frame_ff)
3066        return SCHEDULE_RUN;
3067
    // If the user has explicitly told Cilk to exit (i.e., via
    // __cilkrts_end_cilk() -> __cilkrts_stop_workers(g)), then return
    // SCHEDULE_EXIT.
3070    if (g->work_done)
3071        return SCHEDULE_EXIT;
3072
3073    if (0 == w->self) {
3074        // This worker is the root node and is the only one that may query the
3075        // global state to see if there are still any user workers in Cilk.
3076        if (w->l->steal_failure_count > g->max_steal_failures) {
3077            if (signal_node_should_wait(w->l->signal_node)) {
3078                return SCHEDULE_WAIT;
3079            } else {
3080                // Reset the steal_failure_count since we have verified that
3081                // user workers are still in Cilk.
3082                w->l->steal_failure_count = 0;
3083            }
3084        }
3085    } else if (WORKER_SYSTEM == w->l->type &&
3086               signal_node_should_wait(w->l->signal_node)) {
3087        // This worker has been notified by its parent that it should stop
3088        // trying to steal.
3089        return SCHEDULE_WAIT;
3090    }
3091
3092    return SCHEDULE_RUN;
3093}
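
/*
 * Illustrative sketch (not the actual scheduler loop): the return value of
 * worker_runnable() is meant to be consumed roughly as follows.  The real
 * loop lives elsewhere in this file and also handles stealing and waiting
 * on the worker's signal_node.
 *
 *     switch (worker_runnable(w)) {
 *     case SCHEDULE_RUN:                  // keep scheduling: run or steal work
 *         break;
 *     case SCHEDULE_WAIT:                 // block on w->l->signal_node until woken
 *         break;
 *     case SCHEDULE_EXIT:                 // the runtime is shutting down
 *         return;
 *     }
 */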
3094
3095
3096
3097// Initialize the worker structs, but don't start the workers themselves.
3098static void init_workers(global_state_t *g)
3099{
3100    int total_workers = g->total_workers;
3101    int i;
3102    struct CILK_ALIGNAS(256) buffered_worker {
3103        __cilkrts_worker w;
3104        char buf[64];
3105    } *workers_memory;
3106
3107    /* not needed if only one worker */
3108    cilk_fiber_pool_init(&g->fiber_pool,
3109                         NULL,
3110                         g->stack_size,
3111                         g->global_fiber_pool_size,           // buffer_size
3112                         g->max_stacks,                       // maximum # to allocate
3113                         1);
3114
3115    cilk_fiber_pool_set_fiber_limit(&g->fiber_pool,
3116                                    (g->max_stacks ? g->max_stacks : INT_MAX));
3117
3118    g->workers = (__cilkrts_worker **)
3119        __cilkrts_malloc(total_workers * sizeof(*g->workers));
3120
3121    // Allocate 1 block of memory for workers to make life easier for tools
3122    // like Inspector which run multithreaded and need to know the memory
3123    // range for all the workers that will be accessed in a user's program
3124    workers_memory = (struct buffered_worker*)
3125        __cilkrts_malloc(sizeof(*workers_memory) * total_workers);
3126
3127    // Notify any tools that care (Cilkscreen and Inspector) that they should
3128    // ignore memory allocated for the workers
3129    __cilkrts_cilkscreen_ignore_block(&workers_memory[0],
3130                                      &workers_memory[total_workers]);
3131
3132    // Initialize worker structs, including unused worker slots.
3133    for (i = 0; i < total_workers; ++i) {
3134        g->workers[i] = make_worker(g, i, &workers_memory[i].w);
3135    }
3136
3137    // Set the workers in the first P - 1 slots to be system workers.
3138    // Remaining worker structs already have type == 0.
3139    for (i = 0; i < g->system_workers; ++i) {
3140        make_worker_system(g->workers[i]);
3141    }
3142}
3143
3144void __cilkrts_init_internal(int start)
3145{
3146    global_state_t *g = NULL;
3147
3148    if (cilkg_is_published()) {
3149        g = cilkg_init_global_state();
3150    }
3151    else {
3152
3153        // We think the state has not been published yet.
3154        // Grab the lock and try to initialize/publish.
3155        global_os_mutex_lock();
3156
3157        if (cilkg_is_published()) {
3158            // Some other thread must have snuck in and published.
3159            g = cilkg_init_global_state();
3160        }
3161        else {
3162            // Initialize and retrieve global state
3163            g = cilkg_init_global_state();
3164
3165            // Set the scheduler pointer
3166            g->scheduler = worker_scheduler_function;
3167
3168            // If we're running under a sequential P-Tool (Cilkscreen or
3169            // Cilkview) then there's only one worker and we need to tell
3170            // the tool about the extent of the stack
3171            if (g->under_ptool)
3172                __cilkrts_establish_c_stack();
3173            init_workers(g);
3174
            // Initialize per-worker record/replay logging
3176            replay_init_workers(g);
3177
3178            // Initialize any system dependent global state
3179            __cilkrts_init_global_sysdep(g);
3180
3181
3182            cilkg_publish_global_state(g);
3183        }
3184
3185        global_os_mutex_unlock();
3186    }
3187
3188    CILK_ASSERT(g);
3189
3190    if (start && !g->workers_running)
3191    {
3192        // Acquire the global OS mutex while we're starting the workers
3193        global_os_mutex_lock();
3194        if (!g->workers_running)
3195            // Start P - 1 system workers since P includes the first user
3196            // worker.
3197            __cilkrts_start_workers(g, g->P - 1);
3198        global_os_mutex_unlock();
3199    }
3200}
3201
3202
3203/************************************************************************
3204  Methods for reducer protocol.
3205
3206  Reductions occur in two places:
3207    A. A full frame "ff" is returning from a spawn with a stolen parent.
3208    B. A full frame "ff" is stalling at a sync.
3209
3210  To support parallel reductions, reduction functions need to be
3211  executed while control is on a user stack, before jumping into the
  runtime.  These reductions cannot occur while holding a worker or
3213  frame lock.
3214
3215  Before a worker w executes a reduction in either Case A or B, w's
3216  deque is empty.
3217
3218  Since parallel reductions push work onto the deque, we must do extra
3219  work to set up runtime data structures properly before reductions
3220  begin to allow stealing.  ( Normally, when we have only serial
3221  reductions, once a worker w starts a reduction, its deque remains
3222  empty until w either steals another frame or resumes a suspended
3223  frame.  Thus, we don't care about the state of the deque, since w
3224  will reset its deque when setting up execution of a frame. )
3225
3226  To allow for parallel reductions, we coerce the runtime data
3227  structures so that, from their perspective, it looks as though we
3228  have spliced in an "execute_reductions()" function.  Consider the
3229  two cases for reductions:
3230
3231    Case A: Return from a spawn with a stolen parent.
      Consider a spawned function g that is returning on a worker w.
3233      Assume:
3234          -   g was spawned from a parent function f.
3235          -   ff is the full frame for g's spawn helper
          -   sf is the __cilkrts_stack_frame for g's spawn helper.
3237
3238      We are conceptually splicing "execute_reductions()" so that it
3239      occurs immediately before the spawn helper of g returns to f.
3240
3241      We do so by creating two different world views --- one for the
3242      runtime data structures, and one for the actual control flow.
3243
3244        - Before reductions begin, the runtime data structures should
3245          look as though the spawn helper of g is calling
3246          "execute_reductions()", in terms of both the user stack and
3247          worker deque.  More precisely, w should satisfy the
3248          following properties:
3249
3250              (a) w has ff as its full frame,
3251              (b) w has sf as its __cilkrts_stack_frame, and
3252              (c) w has an empty deque.
3253
          If the runtime satisfies these properties, then if w
          encounters a spawn in a parallel reduction, it can push onto
          a valid deque.  Also, a thief that steals from w will
          build the correct tree of full frames.
3258
3259        - In actual control flow, however, once the
3260          "execute_reductions()" function returns, it is actually
3261          returning to runtime code instead of g's spawn helper.
3262
3263          At the point a worker w began executing reductions, the
3264          control flow / compiled code had already finished g's spawn
3265          helper, and w was about to enter the runtime.  With parallel
3266          reductions, some worker v (which might be different from w)
3267          is the one returning to the runtime.
3268
3269
3270      The reduction logic consists of 4 steps:
3271
3272       A1. Restore runtime data structures to make it look as though
3273           the spawn helper of g() is still the currently executing
3274           frame for w.
3275
3276       A2. Execute reductions on the user stack.  Reductions also
           include the logic for exceptions and stacks.  Note that
3278           reductions start on w, but may finish on a different
3279           worker if there is parallelism in the reduce.
3280
3281       A3. Splice out ff from the tree of full frames.
3282
3283       A4. Jump into the runtime/scheduling stack and execute
3284           "do_return_from_spawn".  This method
3285
3286           (a) Frees the user stack we were just on if it is no longer needed.
           (b) Decrements the join counter on ff->parent, and tries to do a
               provably good steal.
           (c) Cleans up the full frame ff.
3290
3291
3292   Case B: Stalling at a sync.
3293
3294     Consider a function g(), with full frame ff and
3295     __cilkrts_stack_frame sf.  Suppose g() stalls at a sync, and we
3296     are executing reductions.
3297
3298     Conceptually, we are splicing in an "execute_reductions()"
3299     function into g() as the last action that g() takes immediately
3300     before it executes the cilk_sync.
3301
3302     The reduction logic for this case is similar to Case A.
3303
3304       B1. Restore the runtime data structures.
3305
3306           The main difference from Case A is that ff/sf is still a
3307           frame that needs to be executed later (since it is stalling
3308           at a cilk_sync).  Thus, we also need to save the current
3309           stack information into "ff" so that we can correctly resume
3310           execution of "ff" after the sync.
3311
3312       B2. Execute reductions on the user stack.
3313
3314       B3. No frame to splice out of the tree.
3315
3316       B4. Jump into the runtime/scheduling stack and execute "do_sync".
3317           This method:
3318           (a) Frees the user stack we were just on if it is no longer needed.
3319           (b) Tries to execute a provably good steal.
3320
3321  Finally, for the reducer protocol, we consider two reduction paths,
3322  namely a "fast" and "slow" path.  On a fast path, only trivial
3323  merges of reducer maps happen (i.e., one or both of the maps are
3324  NULL).  Otherwise, on the slow path, a reduction actually needs to
3325  happen.
3326
3327*****************************************************************/
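
/*
 * Roadmap sketch (illustrative summary only): for Case A above, the helpers
 * defined below are composed roughly as follows; the authoritative sequence
 * is execute_reductions_for_spawn_return() further down in this file.
 *
 *     // A1: make ff / returning_sf look current again on w
 *     restore_frame_for_spawn_return_reduction(w, ff, returning_sf);
 *     // A2: merge reducer maps and exceptions, trying the fast path first
 *     left_map_ptr = fast_path_reductions_for_spawn_return(w, ff);
 *     if (left_map_ptr)
 *         w = slow_path_reductions_for_spawn_return(w, ff, left_map_ptr);
 *     // A3: splice ff out of the tree of full frames
 *     finish_spawn_return_on_user_stack(w, ff->parent, ff);
 *     // A4: longjmp into the runtime and run do_return_from_spawn()
 *
 * Case B (stalling at a sync) follows the same shape with the *_for_sync
 * variants and do_sync().
 */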
3328
3329/**
3330 * @brief Locations to store the result of a reduction.
3331 *
3332 * Struct storing pointers to the fields in our "left" sibling that we
3333 * should update when splicing out a full frame or stalling at a sync.
3334 */
3335typedef struct {
3336    /** A pointer to the location of our left reducer map. */
3337    struct cilkred_map **map_ptr;
3338
3339    /** A pointer to the location of our left exception. */
3340    struct pending_exception_info **exception_ptr;
3341} splice_left_ptrs;
3342
3343/**
3344 * For a full frame returning from a spawn, calculate the pointers to
3345 * the maps and exceptions to my left.
3346 *
3347 * @param w   The currently executing worker.
3348 * @param ff  Full frame that is dying
3349 * @return    Pointers to our "left" for reducers and exceptions.
3350 */
3351static inline
3352splice_left_ptrs compute_left_ptrs_for_spawn_return(__cilkrts_worker *w,
3353                                                    full_frame *ff)
3354{
3355    // ASSERT: we hold the lock on ff->parent
3356
3357    splice_left_ptrs left_ptrs;
3358    if (ff->left_sibling) {
3359        left_ptrs.map_ptr = &ff->left_sibling->right_reducer_map;
3360        left_ptrs.exception_ptr = &ff->left_sibling->right_pending_exception;
3361    }
3362    else {
3363        full_frame *parent_ff = ff->parent;
3364        left_ptrs.map_ptr = &parent_ff->children_reducer_map;
3365        left_ptrs.exception_ptr = &parent_ff->child_pending_exception;
3366    }
3367    return left_ptrs;
3368}
3369
3370/**
3371 * For a full frame at a sync, calculate the pointers to the maps and
3372 * exceptions to my left.
3373 *
3374 * @param w   The currently executing worker.
3375 * @param ff  Full frame that is stalling at a sync.
3376 * @return    Pointers to our "left" for reducers and exceptions.
3377 */
3378static inline
3379splice_left_ptrs compute_left_ptrs_for_sync(__cilkrts_worker *w,
3380                                            full_frame *ff)
3381{
3382    // ASSERT: we hold the lock on ff
3383    splice_left_ptrs left_ptrs;
3384
3385    // Figure out which map to the left we should merge into.
3386    if (ff->rightmost_child) {
3387        CILK_ASSERT(ff->rightmost_child->parent == ff);
3388        left_ptrs.map_ptr = &(ff->rightmost_child->right_reducer_map);
3389        left_ptrs.exception_ptr = &(ff->rightmost_child->right_pending_exception);
3390    }
3391    else {
        // We have no children, so we should be the last
        // worker at the sync... "left" is our child map.
3394        left_ptrs.map_ptr = &(ff->children_reducer_map);
3395        left_ptrs.exception_ptr = &(ff->child_pending_exception);
3396    }
3397    return left_ptrs;
3398}
3399
3400/**
3401 * After we have completed all reductions on a spawn return, call this
3402 * method to finish up before jumping into the runtime.
3403 *
3404 *   1. Perform the "reduction" on stacks, i.e., execute the left
3405 *      holder logic to pass the leftmost stack up.
3406 *
3407 *      w->l->fiber_to_free holds any stack that needs to be freed
3408 *      when control switches into the runtime fiber.
3409 *
3410 *   2. Unlink and remove child_ff from the tree of full frames.
3411 *
3412 * @param   w          The currently executing worker.
3413 * @param   parent_ff  The parent of child_ff.
3414 * @param   child_ff   The full frame returning from a spawn.
3415 */
3416static inline
3417void finish_spawn_return_on_user_stack(__cilkrts_worker *w,
3418                                       full_frame *parent_ff,
3419                                       full_frame *child_ff)
3420{
3421    CILK_ASSERT(w->l->fiber_to_free == NULL);
3422
3423    // Execute left-holder logic for stacks.
3424    if (child_ff->left_sibling || parent_ff->fiber_child) {
3425        // Case where we are not the leftmost stack.
3426        CILK_ASSERT(parent_ff->fiber_child != child_ff->fiber_self);
3427
3428        // Remember any fiber we need to free in the worker.
3429        // After we jump into the runtime, we will actually do the
3430        // free.
3431        w->l->fiber_to_free = child_ff->fiber_self;
3432    }
3433    else {
3434        // We are leftmost, pass stack/fiber up to parent.
3435        // Thus, no stack/fiber to free.
3436        parent_ff->fiber_child = child_ff->fiber_self;
3437        w->l->fiber_to_free = NULL;
3438    }
3439
3440    child_ff->fiber_self = NULL;
3441
3442    unlink_child(parent_ff, child_ff);
3443}
3444
3445
3446/**
3447 * Executes any fast reductions necessary to splice ff out of the tree
3448 * of full frames.
3449 *
3450 * This "fast" path performs only trivial merges of reducer maps,
 * i.e., when one of them is NULL.
3452 * (See slow_path_reductions_for_spawn_return() for slow path.)
3453 *
 * Returns: NULL if we finished all reductions.
 * Returns: a non-NULL pointer if there are still reductions to execute,
 *            and we should execute the slow path.
3457 *
3458 * This method assumes w holds the frame lock on parent_ff.
3459 * After this method completes:
3460 *    1. We have spliced ff out of the tree of full frames.
3461 *    2. The reducer maps of child_ff have been deposited
3462 *       "left" according to the reducer protocol.
 *    3. w->l->fiber_to_free stores the fiber
 *       that needs to be freed once we jump into the runtime.
3465 *
3466 * We have not, however, decremented the join counter on ff->parent.
3467 * This prevents any other workers from resuming execution of the parent.
3468 *
3469 * @param   w    The currently executing worker.
3470 * @param   ff   The full frame returning from a spawn.
3471 * @return  NULL if we finished all reductions.
3472 * @return  The address where the left map is stored (which should be passed to
3473 *          slow_path_reductions_for_spawn_return()) if there are
3474 *          still reductions to execute.
3475 */
3476struct cilkred_map**
3477fast_path_reductions_for_spawn_return(__cilkrts_worker *w,
3478                                      full_frame *ff)
3479{
3480    // ASSERT: we hold ff->parent->lock.
3481    splice_left_ptrs left_ptrs;
3482
3483    CILK_ASSERT(NULL == w->l->pending_exception);
3484
3485    // Figure out the pointers to the left where I want
3486    // to put reducers and exceptions.
3487    left_ptrs = compute_left_ptrs_for_spawn_return(w, ff);
3488
3489    // Go ahead and merge exceptions while holding the lock.
3490    splice_exceptions_for_spawn(w, ff, left_ptrs.exception_ptr);
3491
3492    // Now check if we have any reductions to perform.
3493    //
3494    // Consider all the cases of left, middle and right maps.
    //  0. (-, -, -)  :  finish, and return NULL.
    //  1. (L, -, -)  :  finish, and return NULL.
    //  2. (-, M, -)  :  slide over to left, finish, and return NULL.
    //  3. (L, M, -)  :  take the slow path; return the left map pointer.
    //  4. (-, -, R)  :  slide over to left, finish, and return NULL.
    //  5. (L, -, R)  :  take the slow path; return the left map pointer.
    //  6. (-, M, R)  :  take the slow path; return the left map pointer.
    //  7. (L, M, R)  :  take the slow path; return the left map pointer.
    //
    // In terms of code:
    //  L == *left_ptrs.map_ptr
    //  M == w->reducer_map
    //  R == ff->right_reducer_map.
3508    //
3509    // The goal of the code below is to execute the fast path with
3510    // as few branches and writes as possible.
3511
3512    int case_value = (*(left_ptrs.map_ptr) != NULL);
3513    case_value += ((w->reducer_map != NULL) << 1);
3514    case_value += ((ff->right_reducer_map != NULL) << 2);
3515
3516    // Fastest path is case_value == 0 or 1.
    if (case_value >= 2) {
3518        switch (case_value) {
3519        case 2:
3520            *(left_ptrs.map_ptr) = w->reducer_map;
3521            w->reducer_map = NULL;
            return NULL;
3524        case 4:
3525            *(left_ptrs.map_ptr) = ff->right_reducer_map;
3526            ff->right_reducer_map = NULL;
3527            return NULL;
3528        default:
3529            // If we have to execute the slow path, then
3530            // return the pointer to the place to deposit the left
3531            // map.
3532            return left_ptrs.map_ptr;
3533        }
3534    }
3535
3536    // Do nothing
3537    return NULL;
3538}
3539
3540
3541/**
3542 * Executes any reductions necessary to splice "ff" frame out of
3543 * the steal tree.
3544 *
3545 * This method executes the "slow" path for reductions on a spawn
3546 * return, i.e., there are non-NULL maps that need to be merged
3547 * together.
3548 *
3549 * This method should execute only if
3550 * fast_path_reductions_for_spawn_return() returns a non-NULL
3551 * left_map_ptr.
3552 *
3553 * Upon entry, left_map_ptr should be the location of the left map
3554 * at the start of the reduction, as calculated by
3555 * fast_path_reductions_for_spawn_return().
3556 *
3557 * After this method completes:
3558 *    1. We have spliced ff out of the tree of full frames.
3559 *    2. The reducer maps of child_ff have been deposited
3560 *       "left" according to the reducer protocol.
 *    3. w->l->fiber_to_free stores the fiber
 *       that needs to be freed once we jump into the runtime.
3563 * We have not, however, decremented the join counter on ff->parent,
3564 * so no one can resume execution of the parent yet.
3565 *
3566 * WARNING:
3567 *   This method assumes the lock on ff->parent is held upon entry, and
 *   upon exit, the worker that returns still holds the lock on ff->parent.
3569 *   This method can, however, release and reacquire the lock on ff->parent.
3570 *
3571 * @param w             The currently executing worker.
3572 * @param ff            The full frame returning from a spawn.
3573 * @param left_map_ptr  Pointer to our initial left map.
3574 * @return              The worker that this method returns on.
3575 */
3576static __cilkrts_worker*
3577slow_path_reductions_for_spawn_return(__cilkrts_worker *w,
3578                                      full_frame *ff,
3579                                      struct cilkred_map **left_map_ptr)
3580{
3581
3582    // CILK_ASSERT: w is holding frame lock on parent_ff.
3583#if REDPAR_DEBUG > 0
3584    CILK_ASSERT(!ff->rightmost_child);
3585    CILK_ASSERT(!ff->is_call_child);
3586#endif
3587
3588    // Loop invariant:
3589    // When beginning this loop, we should
3590    //   1. Be holding the lock on ff->parent.
3591    //   2. left_map_ptr should be the address of the pointer to the left map.
3592    //   3. All maps should be slid over left by one, if possible.
3593    //   4. All exceptions should be merged so far.
3594    while (1) {
3595
3596        // Slide middle map left if possible.
3597        if (!(*left_map_ptr)) {
3598            *left_map_ptr = w->reducer_map;
3599            w->reducer_map = NULL;
3600        }
3601        // Slide right map to middle if possible.
3602        if (!w->reducer_map) {
3603            w->reducer_map = ff->right_reducer_map;
3604            ff->right_reducer_map = NULL;
3605        }
3606
3607        // Since we slid everything left by one,
3608        // we are finished if there is no middle map.
3609        if (!w->reducer_map) {
3610            verify_current_wkr(w);
3611            return w;
3612        }
3613        else {
3614            struct cilkred_map* left_map;
3615            struct cilkred_map* middle_map;
3616            struct cilkred_map* right_map;
3617
3618            // Take all the maps from their respective locations.
3619            // We can't leave them in place and execute a reduction because these fields
3620            // might change once we release the lock.
3621            left_map = *left_map_ptr;
3622            *left_map_ptr = NULL;
3623            middle_map = w->reducer_map;
3624            w->reducer_map = NULL;
3625            right_map = ff->right_reducer_map;
3626            ff->right_reducer_map = NULL;
3627
3628            // WARNING!!! Lock release here.
3629            // We have reductions to execute (and we can't hold locks).
3630            __cilkrts_frame_unlock(w, ff->parent);
3631
3632            // Merge all reducers into the left map.
3633            left_map = repeated_merge_reducer_maps(&w,
3634                                                   left_map,
3635                                                   middle_map);
3636            verify_current_wkr(w);
3637            left_map = repeated_merge_reducer_maps(&w,
3638                                                   left_map,
3639                                                   right_map);
3640            verify_current_wkr(w);
3641            CILK_ASSERT(NULL == w->reducer_map);
3642            // Put the final answer back into w->reducer_map.
3643            w->reducer_map = left_map;
3644
3645            // Save any exceptions generated because of the reduction
3646            // process from the returning worker.  These get merged
3647            // the next time around the loop.
3648            CILK_ASSERT(NULL == ff->pending_exception);
3649            ff->pending_exception = w->l->pending_exception;
3650            w->l->pending_exception = NULL;
3651
3652            // Lock ff->parent for the next loop around.
3653            __cilkrts_frame_lock(w, ff->parent);
3654
3655            // Once we have the lock again, recompute who is to our
3656            // left.
3657            splice_left_ptrs left_ptrs;
3658            left_ptrs = compute_left_ptrs_for_spawn_return(w, ff);
3659
3660            // Update the pointer for the left map.
3661            left_map_ptr = left_ptrs.map_ptr;
3662            // Splice the exceptions for spawn.
3663            splice_exceptions_for_spawn(w, ff, left_ptrs.exception_ptr);
3664        }
3665    }
3666    // We should never break out of this loop.
3667
3668    CILK_ASSERT(0);
3669    return NULL;
3670}
3671
3672
3673
3674/**
3675 * Execute reductions when returning from a spawn whose parent has
3676 * been stolen.
3677 *
3678 * Execution may start on w, but may finish on a different worker.
3679 * This method acquires/releases the lock on ff->parent.
3680 *
3681 * @param w            The currently executing worker.
3682 * @param ff           The full frame of the spawned function that is returning.
3683 * @param returning_sf The __cilkrts_stack_frame for this returning function.
3684 * @return             The worker returning from this method.
3685 */
3686static __cilkrts_worker*
3687execute_reductions_for_spawn_return(__cilkrts_worker *w,
3688                                    full_frame *ff,
3689                                    __cilkrts_stack_frame *returning_sf)
3690{
3691    // Step A1 from reducer protocol described above.
3692    //
3693    // Coerce the runtime into thinking that
3694    // ff/returning_sf are still on the bottom of
3695    // w's deque.
3696    restore_frame_for_spawn_return_reduction(w, ff, returning_sf);
3697
3698    // Step A2 and A3: Execute reductions on user stack.
3699    BEGIN_WITH_FRAME_LOCK(w, ff->parent) {
3700        struct cilkred_map **left_map_ptr;
3701        left_map_ptr = fast_path_reductions_for_spawn_return(w, ff);
3702
3703        // Pointer will be non-NULL if there are
3704        // still reductions to execute.
3705        if (left_map_ptr) {
3706            // WARNING: This method call may release the lock
3707            // on ff->parent and re-acquire it (possibly on a
3708            // different worker).
3709            // We can't hold locks while actually executing
3710            // reduce functions.
3711            w = slow_path_reductions_for_spawn_return(w,
3712                                                      ff,
3713                                                      left_map_ptr);
3714            verify_current_wkr(w);
3715        }
3716
3717        finish_spawn_return_on_user_stack(w, ff->parent, ff);
3718        // WARNING: the use of this lock macro is deceptive.
3719        // The worker may have changed here.
3720    } END_WITH_FRAME_LOCK(w, ff->parent);
3721    return w;
3722}
3723
3724
3725
/**
 * Execute fast "reductions" when ff stalls at a sync.
 *
 * @param   w  The currently executing worker.
 * @param   ff The full frame stalling at a sync.
 * @return  1 if we are finished with all reductions after calling this method.
 * @return  0 if we still need to execute the slow path reductions.
 */
static inline
int fast_path_reductions_for_sync(__cilkrts_worker *w,
                                  full_frame *ff) {
    // Return 0 if there is some reduction that needs to happen.
    return !(w->reducer_map || ff->pending_exception);
}

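/*
 * Note: this check plays the same role for syncs that
 * fast_path_reductions_for_spawn_return() plays for spawn returns: a
 * cheap test that lets the common case (no reducer views and no
 * pending exception to merge) skip the locking slow path below.
 */
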
/**
 * Execute slow reductions when ff stalls at a sync.
 * This method should execute only if
 *   fast_path_reductions_for_sync(w, ff) returned 0.
 *
 * After this method completes:
 *   1. ff's current reducer map has been deposited into
 *       right_reducer_map of ff's rightmost child, or
 *       ff->children_reducer_map if ff has no children.
 *   2. Similarly for ff's current exception.
 *   3. Nothing to calculate for stacks --- if we are stalling
 *      we will always free a stack.
 *
 * This method may repeatedly acquire/release the lock on ff.
 *
 * @param   w  The currently executing worker.
 * @param   ff The full frame stalling at a sync.
 * @return  The worker returning from this method.
 */
static __cilkrts_worker*
slow_path_reductions_for_sync(__cilkrts_worker *w,
                              full_frame *ff)
{
    struct cilkred_map *left_map;
    struct cilkred_map *middle_map;

#if (REDPAR_DEBUG > 0)
    CILK_ASSERT(ff);
    CILK_ASSERT(w->head == w->tail);
#endif

    middle_map = w->reducer_map;
    w->reducer_map = NULL;

    // Loop invariant: middle_map should be valid (the current map to reduce).
    //                 left_map is junk.
    //                 w->reducer_map == NULL.
    while (1) {
        BEGIN_WITH_FRAME_LOCK(w, ff) {
            splice_left_ptrs left_ptrs = compute_left_ptrs_for_sync(w, ff);

            // Grab the "left" map and clear the slot it came from.
            left_map = *(left_ptrs.map_ptr);
            *(left_ptrs.map_ptr) = NULL;

            // Slide the maps left as far as possible.
            if (!left_map) {
                left_map = middle_map;
                middle_map = NULL;
            }

            *(left_ptrs.exception_ptr) =
                __cilkrts_merge_pending_exceptions(w,
                                                   *left_ptrs.exception_ptr,
                                                   ff->pending_exception);
            ff->pending_exception = NULL;

            // If there is no middle map, then we are done.
            // Deposit left and return.
            if (!middle_map) {
                *(left_ptrs.map_ptr) = left_map;
                #if (REDPAR_DEBUG > 0)
                CILK_ASSERT(NULL == w->reducer_map);
                #endif
                // Sanity check upon leaving the loop.
                verify_current_wkr(w);
                // Make sure to unlock before we return!
                __cilkrts_frame_unlock(w, ff);
                return w;
            }
        } END_WITH_FRAME_LOCK(w, ff);

        // If we get here, we have a nontrivial reduction to execute.
        middle_map = repeated_merge_reducer_maps(&w,
                                                 left_map,
                                                 middle_map);
        verify_current_wkr(w);

        // Save any exceptions generated because of the reduction
        // process.  These get merged the next time around the
        // loop.
        CILK_ASSERT(NULL == ff->pending_exception);
        ff->pending_exception = w->l->pending_exception;
        w->l->pending_exception = NULL;
    }

    // We should never break out of the loop above.
    CILK_ASSERT(0);
    return NULL;
}


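/*
 * Outline of one non-trivial iteration of the loop above (a sketch;
 * REDUCE stands for the user's reduce operations as applied by
 * repeated_merge_reducer_maps(), which, per Cilk's left-to-right
 * reduction semantics, is assumed to treat its first map argument as
 * the left operand):
 *
 *     lock ff; grab and clear the map slot to our left; unlock ff
 *     middle_map = REDUCE(left_map, middle_map)   // no locks held here
 *     go around again: more views may have been deposited to our left
 *     while the user reduce code was running.
 *
 * The worker is passed by address (&w) because execution may resume on
 * a different worker after running user code.
 */
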
/**
 * Execute reductions when ff stalls at a sync.
 *
 * Execution starts on w, but may finish on a different worker.
 * This method may acquire/release the lock on ff.
 *
 * @param w          The currently executing worker.
 * @param ff         The full frame of the spawned function at the sync.
 * @param sf_at_sync The __cilkrts_stack_frame stalling at a sync.
 * @return           The worker returning from this method.
 */
static __cilkrts_worker*
execute_reductions_for_sync(__cilkrts_worker *w,
                            full_frame *ff,
                            __cilkrts_stack_frame *sf_at_sync)
{
    int finished_reductions;
    // Step B1 from reducer protocol above:
    // Restore runtime invariants.
    //
    // The code for this step is almost equivalent to the
    // following sequence:
    //   1. disown(w, ff, sf_at_sync, "sync") (which itself
    //        calls make_unrunnable(w, ff, sf_at_sync))
    //   2. make_runnable(w, ff, sf_at_sync).
    //
    // The "disown" will mark the frame "sf_at_sync"
    // as stolen and suspended, and save its place on the stack,
    // so it can be resumed after the sync.
    //
    // The difference is that we don't want the disown to
    // break the following connections yet, since we are
    // about to immediately make sf/ff runnable again anyway.
    //   sf_at_sync->worker == w
    //   w->l->frame_ff == ff.
    //
    // These connections are needed for parallel reductions, since
    // we will use sf / ff as the stack frame / full frame for
    // executing any potential reductions.
    //
    // TBD: Can we refactor the disown / make_unrunnable code
    // to avoid the code duplication here?

    ff->call_stack = NULL;

    // Normally, "make_unrunnable" would add CILK_FRAME_STOLEN and
    // CILK_FRAME_SUSPENDED to sf_at_sync->flags and save the state of
    // the stack so that a worker can resume the frame in the correct
    // place.
    //
    // But on this path, CILK_FRAME_STOLEN should already be set.
    // Also, we technically don't want to suspend the frame until
    // the reduction finishes.
    // We do, however, need to save the stack before
    // we start any reductions, since the reductions might push more
    // data onto the stack.
    CILK_ASSERT(sf_at_sync->flags & CILK_FRAME_STOLEN);

    __cilkrts_put_stack(ff, sf_at_sync);
    __cilkrts_make_unrunnable_sysdep(w, ff, sf_at_sync, 1,
                                     "execute_reductions_for_sync");
    CILK_ASSERT(w->l->frame_ff == ff);

    // Step B2: Execute reductions on user stack.
    // Check if we have any "real" reductions to do.
    finished_reductions = fast_path_reductions_for_sync(w, ff);

    if (!finished_reductions) {
        // Still have some real reductions to execute.
        // Run them here.

        // This method may acquire/release the lock on ff.
        w = slow_path_reductions_for_sync(w, ff);

        // The previous call may return on a different worker
        // than the one we started on.
        verify_current_wkr(w);
    }

#if (REDPAR_DEBUG > 0)
    CILK_ASSERT(w->l->frame_ff == ff);
    CILK_ASSERT(ff->call_stack == NULL);
#endif

    // Now we suspend the frame ff (since we've
    // finished the reductions).  Roughly, we've split apart the
    // "make_unrunnable" call here --- we've already saved the
    // stack info earlier, before the reductions executed.
    // All that remains is to restore the call stack back into the
    // full frame, and mark the frame as suspended.
    ff->call_stack = sf_at_sync;
    sf_at_sync->flags |= CILK_FRAME_SUSPENDED;

    // At a nontrivial sync, we should always free the current fiber,
    // because it cannot be the leftmost one.
    w->l->fiber_to_free = ff->fiber_self;
    ff->fiber_self = NULL;
    return w;
}


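/*
 * Illustrative use at a sync (a sketch only; the actual call site is
 * elsewhere in this file):
 *
 *     w = execute_reductions_for_sync(w, ff, sf_at_sync);
 *     // Post-conditions visible to the caller:
 *     //   - ff->call_stack == sf_at_sync, with CILK_FRAME_SUSPENDED set;
 *     //   - ff->fiber_self has been handed off to w->l->fiber_to_free;
 *     //   - execution may have migrated, so only the returned worker
 *     //     pointer may be used from here on.
 */
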
/*
  Local Variables: **
  c-file-style:"bsd" **
  c-basic-offset:4 **
  indent-tabs-mode:nil **
  End: **
*/
