/* scheduler.c                  -*-C-*-
 *
 *************************************************************************
 *
 *  @copyright
 *  Copyright (C) 2007-2013, Intel Corporation
 *  All rights reserved.
 *
 *  @copyright
 *  Redistribution and use in source and binary forms, with or without
 *  modification, are permitted provided that the following conditions
 *  are met:
 *
 *    * Redistributions of source code must retain the above copyright
 *      notice, this list of conditions and the following disclaimer.
 *    * Redistributions in binary form must reproduce the above copyright
 *      notice, this list of conditions and the following disclaimer in
 *      the documentation and/or other materials provided with the
 *      distribution.
 *    * Neither the name of Intel Corporation nor the names of its
 *      contributors may be used to endorse or promote products derived
 *      from this software without specific prior written permission.
 *
 *  @copyright
 *  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 *  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 *  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 *  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 *  HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 *  INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 *  BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
 *  OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 *  AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 *  LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY
 *  WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 *  POSSIBILITY OF SUCH DAMAGE.
 *
 **************************************************************************/

/*
 * Cilk scheduler
 */

#include "scheduler.h"
#include "bug.h"
#include "os.h"
#include "os_mutex.h"
#include "local_state.h"
#include "signal_node.h"
#include "full_frame.h"
#include "sysdep.h"
#include "except.h"
#include "cilk_malloc.h"
#include "pedigrees.h"
#include "record-replay.h"

#include <limits.h>
#include <string.h> /* memcpy */
#include <stdio.h>  // sprintf
#include <stdlib.h> // malloc, free, abort

#ifdef _WIN32
#   pragma warning(disable:1786)   // disable warning: sprintf is deprecated
#   include "sysdep-win.h"
#   include "except-win32.h"
#endif  // _WIN32

// ICL: Don't complain about conversion from pointer to same-sized integral
// type in __cilkrts_put_stack.  That's why we're using ptrdiff_t.
#ifdef _WIN32
#   pragma warning(disable: 1684)
#endif

#include "cilk/cilk_api.h"
#include "frame_malloc.h"
#include "metacall_impl.h"
#include "reducer_impl.h"
#include "cilk-tbb-interop.h"
#include "cilk-ittnotify.h"
#include "stats.h"

// ICL: Don't complain about loss of precision in myrand
// I tried restoring the warning after the function, but it didn't
// suppress it
#ifdef _WIN32
#   pragma warning(disable: 2259)
#endif

#ifndef _WIN32
#   include <unistd.h>
#endif

#ifdef __VXWORKS__
// redeclare longjmp() with noreturn to stop warnings
extern __attribute__((noreturn))
    void longjmp(jmp_buf, int);
#endif

//#define DEBUG_LOCKS 1
#ifdef DEBUG_LOCKS
// The currently executing worker must own this worker's lock
#   define ASSERT_WORKER_LOCK_OWNED(w) \
        { \
            __cilkrts_worker *tls_worker = __cilkrts_get_tls_worker(); \
            CILK_ASSERT((w)->l->lock.owner == tls_worker); \
        }
#else
#   define ASSERT_WORKER_LOCK_OWNED(w)
#endif // DEBUG_LOCKS

// Options for the scheduler.
enum schedule_t { SCHEDULE_RUN,
                  SCHEDULE_WAIT,
                  SCHEDULE_EXIT };

// Return values for provably_good_steal()
enum provably_good_steal_t
{
    ABANDON_EXECUTION,  // Not the last child to the sync - attempt to steal work
    CONTINUE_EXECUTION, // Last child to the sync - continue executing on this worker
    WAIT_FOR_CONTINUE   // The replay log indicates that this was the worker
                        // which continued.  Loop until we are the last worker
                        // to the sync.
};


// Verify that "w" is the worker we are currently executing on.
// Because this check is expensive, this method is usually a no-op.
static inline void verify_current_wkr(__cilkrts_worker *w)
{
#if ((REDPAR_DEBUG >= 3) || (FIBER_DEBUG >= 1))
    // Lookup the worker from TLS and compare to w.
    __cilkrts_worker* tmp = __cilkrts_get_tls_worker();
    if (w != tmp) {
        fprintf(stderr, "Error. W=%d, actual worker =%d...\n",
                w->self,
                tmp->self);
    }
    CILK_ASSERT(w == tmp);
#endif
}

static enum schedule_t worker_runnable(__cilkrts_worker *w);

// Scheduling-fiber functions:
static void do_return_from_spawn (__cilkrts_worker *w,
                                  full_frame *ff,
                                  __cilkrts_stack_frame *sf);
static void do_sync (__cilkrts_worker *w,
                     full_frame *ff,
                     __cilkrts_stack_frame *sf);

// max is defined on Windows and VxWorks
#if (! defined(_WIN32)) && (! defined(__VXWORKS__))
    // TBD: definition of max() for Linux.
#   define max(a, b) ((a) < (b) ? (b) : (a))
#endif

void __cilkrts_dump_stats_to_stderr(global_state_t *g)
{
#ifdef CILK_PROFILE
    int i;
    for (i = 0; i < g->total_workers; ++i) {
        // Print out statistics for each worker.  We collected them,
        // so why not print them out?
        fprintf(stderr, "Stats for worker %d\n", i);
        dump_stats_to_file(stderr, g->workers[i]->l->stats);
        __cilkrts_accum_stats(&g->stats, g->workers[i]->l->stats);
    }

    // Also print out aggregate statistics.
    dump_stats_to_file(stderr, &g->stats);
#endif
    fprintf(stderr,
            "CILK PLUS Thread Info: P=%d, Q=%d\n",
            g->P,
            g->Q);
    fprintf(stderr,
            "CILK PLUS RUNTIME MEMORY USAGE: %lld bytes",
            (long long)g->frame_malloc.allocated_from_os);
#ifdef CILK_PROFILE
    if (g->stats.stack_hwm)
        fprintf(stderr, ", %ld stacks", g->stats.stack_hwm);
#endif
    fputc('\n', stderr);
}

static void validate_worker(__cilkrts_worker *w)
{
    /* check the magic numbers, for debugging purposes */
    if (w->l->worker_magic_0 != WORKER_MAGIC_0 ||
        w->l->worker_magic_1 != WORKER_MAGIC_1)
        abort_because_rts_is_corrupted();
}

static void double_link(full_frame *left_ff, full_frame *right_ff)
{
    if (left_ff)
        left_ff->right_sibling = right_ff;
    if (right_ff)
        right_ff->left_sibling = left_ff;
}

/* add CHILD to the right of all children of PARENT */
static void push_child(full_frame *parent_ff, full_frame *child_ff)
{
    double_link(parent_ff->rightmost_child, child_ff);
    double_link(child_ff, 0);
    parent_ff->rightmost_child = child_ff;
}

/* unlink CHILD from the list of all children of PARENT */
static void unlink_child(full_frame *parent_ff, full_frame *child_ff)
{
    double_link(child_ff->left_sibling, child_ff->right_sibling);

    if (!child_ff->right_sibling) {
        /* this is the rightmost child -- update parent link */
        CILK_ASSERT(parent_ff->rightmost_child == child_ff);
        parent_ff->rightmost_child = child_ff->left_sibling;
    }
    child_ff->left_sibling = child_ff->right_sibling = 0; /* paranoia */
}

static void incjoin(full_frame *ff)
{
    ++ff->join_counter;
}

static int decjoin(full_frame *ff)
{
    CILK_ASSERT(ff->join_counter > 0);
    return (--ff->join_counter);
}

static int simulate_decjoin(full_frame *ff)
{
    CILK_ASSERT(ff->join_counter > 0);
    return (ff->join_counter - 1);
}

/*
 * Pseudo-random generator defined by the congruence S' = 69070 * S
 * mod (2^32 - 5).  Marsaglia (CACM July 1993) says on page 107 that
 * this is a ``good one''.  There you go.
 *
 * The literature makes a big fuss about avoiding the division, but
 * for us it is not worth the hassle.
 */
static const unsigned RNGMOD = ((1ULL << 32) - 5);
static const unsigned RNGMUL = 69070U;

static unsigned myrand(__cilkrts_worker *w)
{
    unsigned state = w->l->rand_seed;
    state = (unsigned)((RNGMUL * (unsigned long long)state) % RNGMOD);
    w->l->rand_seed = state;
    return state;
}

static void mysrand(__cilkrts_worker *w, unsigned seed)
{
    seed %= RNGMOD;
    seed += (seed == 0); /* 0 does not belong to the multiplicative
                            group.  Use 1 instead */
    w->l->rand_seed = seed;
}
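/*
 * Illustrative sketch (not part of the runtime; the helper name and seed
 * are hypothetical): the Marsaglia recurrence above, iterated directly.
 * It shows why the seed normalization in mysrand() matters -- any state
 * in [1, 2^32 - 6] stays in the multiplicative group and never hits 0.
 */
#if 0
static unsigned rng_demo(void)
{
    unsigned state = 1;   /* any seed in [1, RNGMOD - 1] works */
    int i;
    for (i = 0; i < 3; ++i) {
        /* S' = 69070 * S mod (2^32 - 5), exactly as in myrand() */
        state = (unsigned)((69070ULL * state) % ((1ULL << 32) - 5));
    }
    return state;
}
#endif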
/* W grabs its own lock */
void __cilkrts_worker_lock(__cilkrts_worker *w)
{
    validate_worker(w);
    CILK_ASSERT(w->l->do_not_steal == 0);

    /* tell thieves to stay out of the way */
    w->l->do_not_steal = 1;
    __cilkrts_fence(); /* probably redundant */

    __cilkrts_mutex_lock(w, &w->l->lock);
}

void __cilkrts_worker_unlock(__cilkrts_worker *w)
{
    __cilkrts_mutex_unlock(w, &w->l->lock);
    CILK_ASSERT(w->l->do_not_steal == 1);
    /* The fence is probably redundant.  Use a release
       operation when supported (gcc and compatible);
       that is faster on x86 which serializes normal stores. */
#if defined __GNUC__ && (__GNUC__ * 10 + __GNUC_MINOR__ > 43 || __ICC >= 1110)
    __sync_lock_release(&w->l->do_not_steal);
#else
    w->l->do_not_steal = 0;
    __cilkrts_fence(); /* store-store barrier, redundant on x86 */
#endif
}

/* try to acquire the lock of some *other* worker */
static int worker_trylock_other(__cilkrts_worker *w,
                                __cilkrts_worker *other)
{
    int status = 0;

    validate_worker(other);

    /* This protocol guarantees that, after setting the DO_NOT_STEAL
       flag, worker W can enter its critical section after waiting for
       the thief currently in the critical section (if any) and at
       most one other thief.

       This requirement is overly paranoid, but it should protect us
       against future nonsense from OS implementors.
    */

    /* compete for the right to disturb OTHER */
    if (__cilkrts_mutex_trylock(w, &other->l->steal_lock)) {
        if (other->l->do_not_steal) {
            /* leave it alone */
        } else {
            status = __cilkrts_mutex_trylock(w, &other->l->lock);
        }
        __cilkrts_mutex_unlock(w, &other->l->steal_lock);
    }


    return status;
}

static void worker_unlock_other(__cilkrts_worker *w,
                                __cilkrts_worker *other)
{
    __cilkrts_mutex_unlock(w, &other->l->lock);
}


/* Lock macro Usage:
    BEGIN_WITH_WORKER_LOCK(w) {
        statement;
        statement;
        BEGIN_WITH_FRAME_LOCK(w, ff) {
            statement;
            statement;
        } END_WITH_FRAME_LOCK(w, ff);
    } END_WITH_WORKER_LOCK(w);
 */
#define BEGIN_WITH_WORKER_LOCK(w) __cilkrts_worker_lock(w); do
#define END_WITH_WORKER_LOCK(w)   while (__cilkrts_worker_unlock(w), 0)

// TBD(jsukha): These are worker lock acquisitions on
// a worker whose deque is empty.  My conjecture is that we
// do not need to hold the worker lock at these points.
// I have left them in for now, however.
//
// #define REMOVE_POSSIBLY_OPTIONAL_LOCKS
#ifdef REMOVE_POSSIBLY_OPTIONAL_LOCKS
    #define BEGIN_WITH_WORKER_LOCK_OPTIONAL(w) do
    #define END_WITH_WORKER_LOCK_OPTIONAL(w)   while (0)
#else
    #define BEGIN_WITH_WORKER_LOCK_OPTIONAL(w) __cilkrts_worker_lock(w); do
    #define END_WITH_WORKER_LOCK_OPTIONAL(w)   while (__cilkrts_worker_unlock(w), 0)
#endif


#define BEGIN_WITH_FRAME_LOCK(w, ff)                                     \
    do { full_frame *_locked_ff = ff; __cilkrts_frame_lock(w, _locked_ff); do

#define END_WITH_FRAME_LOCK(w, ff)                       \
    while (__cilkrts_frame_unlock(w, _locked_ff), 0); } while (0)
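/*
 * Illustrative expansion (sketch, not compiled): the macro pair above
 * relies on the comma operator in the while test, so the unlock runs
 * exactly once after the braced body, and the construct still requires a
 * trailing semicolon like an ordinary statement.
 */
#if 0
__cilkrts_worker_lock(w);
do {
    /* body */
} while (__cilkrts_worker_unlock(w), 0);
#endif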
/* W becomes the owner of F and F can be stolen from W */
static void make_runnable(__cilkrts_worker *w, full_frame *ff)
{
    w->l->frame_ff = ff;

    /* CALL_STACK is invalid (the information is stored implicitly in W) */
    ff->call_stack = 0;
}

/*
 * The worker parameter is unused, except for print-debugging purposes.
 */
static void make_unrunnable(__cilkrts_worker *w,
                            full_frame *ff,
                            __cilkrts_stack_frame *sf,
                            int is_loot,
                            const char *why)
{
    /* CALL_STACK becomes valid again */
    ff->call_stack = sf;

    if (sf) {
#if CILK_LIB_DEBUG
        if (__builtin_expect(sf->flags & CILK_FRAME_EXITING, 0))
            __cilkrts_bug("W%d suspending exiting frame %p/%p\n", w->self, ff, sf);
#endif
        sf->flags |= CILK_FRAME_STOLEN | CILK_FRAME_SUSPENDED;
        sf->worker = 0;

        if (is_loot)
            __cilkrts_put_stack(ff, sf);

        /* perform any system-dependent action, such as saving the
           state of the stack */
        __cilkrts_make_unrunnable_sysdep(w, ff, sf, is_loot, why);
    }
}


/* Push the next full frame to be made active in this worker and increment its
 * join counter.  __cilkrts_push_next_frame and pop_next_frame work on a
 * one-element queue.  This queue is used to communicate across the runtime
 * from the code that wants to activate a frame to the code that can actually
 * begin execution on that frame.  They are asymmetrical in that push
 * increments the join counter but pop does not decrement it.  Rather, a
 * single push/pop combination makes a frame active and increments its join
 * counter once. */
void __cilkrts_push_next_frame(__cilkrts_worker *w, full_frame *ff)
{
    CILK_ASSERT(ff);
    CILK_ASSERT(!w->l->next_frame_ff);
    incjoin(ff);
    w->l->next_frame_ff = ff;
}

/* Get the next full-frame to be made active in this worker.  The join count
 * of the full frame will have been incremented by the corresponding push
 * event.  See __cilkrts_push_next_frame, above.
 */
static full_frame *pop_next_frame(__cilkrts_worker *w)
{
    full_frame *ff;
    ff = w->l->next_frame_ff;
    // Remove the frame from the next_frame field.
    //
    // If this is a user worker, then there is a chance that another worker
    // from our team could push work into our next_frame (if it is the last
    // worker doing work for this team).  The other worker's setting of the
    // next_frame could race with our setting of next_frame to NULL.  This is
    // the only possible race condition on next_frame.  However, if next_frame
    // has a non-NULL value, then it means the team still has work to do, and
    // there is no chance of another team member populating next_frame.  Thus,
    // it is safe to set next_frame to NULL, if it was populated.  There is no
    // need for an atomic op.
    if (NULL != ff) {
        w->l->next_frame_ff = NULL;
    }
    return ff;
}
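/*
 * Illustrative sketch (hypothetical helper, not in the runtime): one
 * push/pop cycle through the one-element queue above.  The net effect on
 * ff is a single incjoin(), since pop deliberately does not decrement.
 */
#if 0
static void activate_frame_demo(__cilkrts_worker *w, full_frame *ff)
{
    __cilkrts_push_next_frame(w, ff);      /* queue full; incjoin(ff) done */
    full_frame *next = pop_next_frame(w);  /* queue empty; join count kept */
    CILK_ASSERT(next == ff);
}
#endif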
/*
 * Identify the single worker that is allowed to cross a sync in this frame.  A
 * thief should call this function when it is the first to steal work from a
 * user worker.  "First to steal work" may mean that there has been parallelism
 * in the user worker before, but the whole team sync'd, and this is the first
 * steal after that.
 *
 * This should happen while holding the worker and frame lock.
 */
static void set_sync_master(__cilkrts_worker *w, full_frame *ff)
{
    w->l->last_full_frame = ff;
    ff->sync_master = w;
}

/*
 * The sync that ends all parallelism for a particular user worker is about to
 * be crossed.  Decouple the worker and frame.
 *
 * No locks need to be held since the user worker isn't doing anything, and none
 * of the system workers can steal from it.  But unset_sync_master() should be
 * called before the user worker knows about this work (i.e., before
 * w->l->next_frame_ff is set).
 */
static void unset_sync_master(__cilkrts_worker *w, full_frame *ff)
{
    CILK_ASSERT(WORKER_USER == w->l->type);
    CILK_ASSERT(ff->sync_master == w);
    ff->sync_master = NULL;
    w->l->last_full_frame = NULL;
}

/********************************************************************
 * THE protocol:
 ********************************************************************/
/*
 * This is a protocol for work stealing that minimizes the overhead on
 * the victim.
 *
 * The protocol uses three shared pointers into the worker's deque:
 * - T - the "tail"
 * - H - the "head"
 * - E - the "exception"  NB: In this case, "exception" has nothing to do
 * with C++ throw-catch exceptions -- it refers only to a non-normal return,
 * i.e., a steal or similar scheduling exception.
 *
 * with H <= E, H <= T.
 *
 * Stack frames SF, where H <= E < T, are available for stealing.
 *
 * The worker operates on the T end of the stack.  The frame being
 * worked on is not on the stack.  To make a continuation available for
 * stealing the worker pushes a frame onto the stack: stores *T++ = SF.
 * To return, it pops the frame off the stack: obtains SF = *--T.
 *
 * After decrementing T, the condition E > T signals to the victim that
 * it should invoke the runtime system's "THE" exception handler.  The
 * pointer E can become INFINITY, in which case the victim must invoke
 * the THE exception handler as soon as possible.
 *
 * See "The implementation of the Cilk-5 multithreaded language", PLDI 1998,
 * http://portal.acm.org/citation.cfm?doid=277652.277725, for more information
 * on the THE protocol.
 */
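/*
 * Victim-side sketch of the protocol above (illustrative only; the real
 * push/pop live in compiler-emitted spawn prologue/epilogue code and
 * elsewhere in the runtime, not in this function).
 */
#if 0
static void the_victim_side_demo(__cilkrts_worker *w, __cilkrts_stack_frame *sf)
{
    *w->tail++ = sf;     /* spawn: make the continuation stealable, T++ */

    /* ... the spawned child runs ... */

    --w->tail;           /* return: pop the frame, --T */
    if (w->exc > w->tail) {
        /* E > T: a thief interfered (or E == EXC_INFINITY), so take the
           runtime's THE exception-handling path instead of a plain
           return. */
    }
}
#endif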
/* the infinity value of E */
#define EXC_INFINITY  ((__cilkrts_stack_frame **) (-1))

static void increment_E(__cilkrts_worker *victim)
{
    __cilkrts_stack_frame *volatile *tmp;

    // The currently executing worker must own the worker lock to touch
    // victim->exc
    ASSERT_WORKER_LOCK_OWNED(victim);

    tmp = victim->exc;
    if (tmp != EXC_INFINITY) {
        /* On most x86 this pair of operations would be slightly faster
           as an atomic exchange due to the implicit memory barrier in
           an atomic instruction. */
        victim->exc = tmp + 1;
        __cilkrts_fence();
    }
}

static void decrement_E(__cilkrts_worker *victim)
{
    __cilkrts_stack_frame *volatile *tmp;

    // The currently executing worker must own the worker lock to touch
    // victim->exc
    ASSERT_WORKER_LOCK_OWNED(victim);

    tmp = victim->exc;
    if (tmp != EXC_INFINITY) {
        /* On most x86 this pair of operations would be slightly faster
           as an atomic exchange due to the implicit memory barrier in
           an atomic instruction. */
        victim->exc = tmp - 1;
        __cilkrts_fence(); /* memory fence not really necessary */
    }
}

#if 0
/* for now unused, will be necessary if we implement abort */
static void signal_THE_exception(__cilkrts_worker *wparent)
{
    wparent->exc = EXC_INFINITY;
    __cilkrts_fence();
}
#endif

static void reset_THE_exception(__cilkrts_worker *w)
{
    // The currently executing worker must own the worker lock to touch
    // w->exc
    ASSERT_WORKER_LOCK_OWNED(w);

    w->exc = w->head;
    __cilkrts_fence();
}

/* conditions under which victim->head can be stolen: */
static int can_steal_from(__cilkrts_worker *victim)
{
    return ((victim->head < victim->tail) &&
            (victim->head < victim->protected_tail));
}

/* Return TRUE if the frame can be stolen, false otherwise */
static int dekker_protocol(__cilkrts_worker *victim)
{
    // increment_E and decrement_E are going to touch victim->exc.  The
    // currently executing worker must own victim's lock before they can
    // modify it
    ASSERT_WORKER_LOCK_OWNED(victim);

    /* ASSERT(E >= H); */

    increment_E(victim);

    /* ASSERT(E >= H + 1); */
    if (can_steal_from(victim)) {
        /* success, we can steal victim->head and set H <- H + 1
           in detach() */
        return 1;
    } else {
        /* failure, restore previous state */
        decrement_E(victim);
        return 0;
    }
}
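/*
 * Thief-side sketch of how dekker_protocol() is meant to be used.  This
 * is a condensed view of what random_steal() below actually does, not a
 * new entry point.
 */
#if 0
static void thief_side_demo(__cilkrts_worker *w, __cilkrts_worker *victim)
{
    if (worker_trylock_other(w, victim)) {   /* serialize with the victim */
        if (dekker_protocol(victim)) {
            /* E was incremented and H < T still holds, so victim->head
               belongs to the thief; detach_for_steal() (below) performs
               the actual transfer. */
        }
        worker_unlock_other(w, victim);
    }
}
#endif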
/* Link PARENT and CHILD in the spawn tree */
static full_frame *make_child(__cilkrts_worker *w,
                              full_frame *parent_ff,
                              __cilkrts_stack_frame *child_sf,
                              cilk_fiber *fiber)
{
    full_frame *child_ff = __cilkrts_make_full_frame(w, child_sf);

    child_ff->parent = parent_ff;
    push_child(parent_ff, child_ff);

    //DBGPRINTF("%d-          make_child - child_frame: %p, parent_frame: %p, child_sf: %p\n"
    //          "            parent - parent: %p, left_sibling: %p, right_sibling: %p, rightmost_child: %p\n"
    //          "            child  - parent: %p, left_sibling: %p, right_sibling: %p, rightmost_child: %p\n",
    //          w->self, child, parent, child_sf,
    //          parent->parent, parent->left_sibling, parent->right_sibling, parent->rightmost_child,
    //          child->parent, child->left_sibling, child->right_sibling, child->rightmost_child);

    CILK_ASSERT(parent_ff->call_stack);
    child_ff->is_call_child = (fiber == NULL);

    /* PLACEHOLDER_FIBER is used as a non-null marker indicating that
       child should be treated as a spawn child even though we have not
       yet assigned a real fiber to its parent. */
    if (fiber == PLACEHOLDER_FIBER)
        fiber = NULL; /* Parent actually gets a null fiber, for now */

    /* perform any system-dependent actions, such as capturing
       parameter passing information */
    /*__cilkrts_make_child_sysdep(child, parent);*/

    /* Child gets reducer map and stack of parent.
       Parent gets a new map and new stack. */
    child_ff->fiber_self = parent_ff->fiber_self;
    child_ff->sync_master = NULL;

    if (child_ff->is_call_child) {
        /* Cause segfault on any attempted access.  The parent gets
           the child map and stack when the child completes. */
        parent_ff->fiber_self = 0;
    } else {
        parent_ff->fiber_self = fiber;
    }

    incjoin(parent_ff);
    return child_ff;
}

static inline __cilkrts_stack_frame *__cilkrts_advance_frame(__cilkrts_stack_frame *sf)
{
    __cilkrts_stack_frame *p = sf->call_parent;
    sf->call_parent = 0;
    return p;
}

/* w should be the currently executing worker.
 * loot_sf is the youngest stack frame in the call stack being
 *   unrolled (i.e., the most deeply nested stack frame.)
 *
 * When this method is called for a steal, loot_sf should be on a
 * victim worker which is different from w.
 * For CILK_FORCE_REDUCE, the victim worker will equal w.
 *
 * Before execution, the __cilkrts_stack_frame's have pointers from
 * younger to older, i.e., a __cilkrts_stack_frame points to its parent.
 *
 * This method creates a full frame for each __cilkrts_stack_frame in
 * the call stack, with each full frame also pointing to its parent.
 *
 * The method returns the full frame created for loot_sf, i.e., the
 * youngest full frame.
 */
static full_frame *unroll_call_stack(__cilkrts_worker *w,
                                     full_frame *ff,
                                     __cilkrts_stack_frame *const loot_sf)
{
    __cilkrts_stack_frame *sf = loot_sf;
    __cilkrts_stack_frame *rev_sf = 0;
    __cilkrts_stack_frame *t_sf;

    CILK_ASSERT(sf);
    /*CILK_ASSERT(sf->call_parent != sf);*/

    /* The leafmost frame is unsynched. */
    if (sf->worker != w)
        sf->flags |= CILK_FRAME_UNSYNCHED;

    /* Reverse the call stack to make a linked list ordered from parent
       to child.  sf->call_parent points to the child of SF instead of
       the parent.  */
    do {
        t_sf = (sf->flags & (CILK_FRAME_DETACHED|CILK_FRAME_STOLEN|CILK_FRAME_LAST))? 0 : sf->call_parent;
        sf->call_parent = rev_sf;
        rev_sf = sf;
        sf = t_sf;
    } while (sf);
    sf = rev_sf;

    /* Promote each stack frame to a full frame in order from parent
       to child, following the reversed list we just built. */
    make_unrunnable(w, ff, sf, sf == loot_sf, "steal 1");
    /* T is the *child* of SF, because we have reversed the list */
    for (t_sf = __cilkrts_advance_frame(sf); t_sf;
         sf = t_sf, t_sf = __cilkrts_advance_frame(sf)) {
        ff = make_child(w, ff, t_sf, NULL);
        make_unrunnable(w, ff, t_sf, t_sf == loot_sf, "steal 2");
    }

    /* XXX What if the leafmost frame does not contain a sync
       and this steal is from promote own deque? */
    /*sf->flags |= CILK_FRAME_UNSYNCHED;*/

    CILK_ASSERT(!sf->call_parent);
    return ff;
}
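/*
 * Minimal sketch of the reversal performed in unroll_call_stack(), on a
 * toy node type (hypothetical, for illustration only).  Given a leaf
 * whose call_parent chain runs young -> old, it repoints every link so
 * the chain runs old -> young and returns the oldest node, mirroring
 * rev_sf above.
 */
#if 0
struct toy_frame { struct toy_frame *call_parent; };

static struct toy_frame *reverse_demo(struct toy_frame *leaf)
{
    struct toy_frame *sf = leaf, *rev = 0, *t;
    while (sf) {
        t = sf->call_parent;   /* remember the next older frame */
        sf->call_parent = rev; /* repoint the link toward the child */
        rev = sf;
        sf = t;
    }
    return rev;                /* oldest frame; links now run old -> young */
}
#endif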
/* detach the top of the deque frame from the VICTIM and install a new
   CHILD frame in its place */
static void detach_for_steal(__cilkrts_worker *w,
                             __cilkrts_worker *victim,
                             cilk_fiber* fiber)
{
    /* ASSERT: we own victim->lock */

    full_frame *parent_ff, *child_ff, *loot_ff;
    __cilkrts_stack_frame *volatile *h;
    __cilkrts_stack_frame *sf;

    w->l->team = victim->l->team;

    CILK_ASSERT(w->l->frame_ff == 0 || w == victim);

    h = victim->head;

    CILK_ASSERT(*h);

    victim->head = h + 1;

    parent_ff = victim->l->frame_ff;
    BEGIN_WITH_FRAME_LOCK(w, parent_ff) {
        /* parent no longer referenced by victim */
        decjoin(parent_ff);

        /* obtain the victim call stack */
        sf = *h;

        /* perform system-dependent normalizations */
        /*__cilkrts_normalize_call_stack_on_steal(sf);*/

        /* unroll PARENT_FF with call stack SF, adopt the youngest
           frame LOOT.  If loot_ff == parent_ff, then we hold loot_ff->lock,
           otherwise, loot_ff is newly created and we can modify it without
           holding its lock. */
        loot_ff = unroll_call_stack(w, parent_ff, sf);

        #if REDPAR_DEBUG >= 3
        fprintf(stderr, "[W=%d, victim=%d, desc=detach, parent_ff=%p, loot=%p]\n",
                w->self, victim->self,
                parent_ff, loot_ff);
        #endif

        if (WORKER_USER == victim->l->type &&
            NULL == victim->l->last_full_frame) {
            // Mark this looted frame as special: only the original user worker
            // may cross the sync.
            //
            // This call is a shared access to
            // victim->l->last_full_frame.
            set_sync_master(victim, loot_ff);
        }

        /* LOOT is the next frame that the thief W is supposed to
           run, unless the thief is stealing from itself, in which
           case the thief W == VICTIM executes CHILD and nobody
           executes LOOT. */
        if (w == victim) {
            /* Pretend that frame has been stolen */
            loot_ff->call_stack->flags |= CILK_FRAME_UNSYNCHED;
            loot_ff->simulated_stolen = 1;
        }
        else
            __cilkrts_push_next_frame(w, loot_ff);

        // After this "push_next_frame" call, w now owns loot_ff.
        child_ff = make_child(w, loot_ff, 0, fiber);

        BEGIN_WITH_FRAME_LOCK(w, child_ff) {
            /* install child in the victim's work queue, taking
               the parent_ff's place */
            /* child is referenced by victim */
            incjoin(child_ff);

            // With this call, w is bestowing ownership of the newly
            // created frame child_ff to the victim, and victim is
            // giving up ownership of parent_ff.
            //
            // Worker w will either take ownership of parent_ff
            // if parent_ff == loot_ff, or parent_ff will be
            // suspended.
            //
            // Note that this call changes the victim->frame_ff
            // while the victim may be executing.
            make_runnable(victim, child_ff);
        } END_WITH_FRAME_LOCK(w, child_ff);
    } END_WITH_FRAME_LOCK(w, parent_ff);
}

/**
 * @brief cilk_fiber_proc that resumes user code after a successful
 * random steal.
 *
 * This function longjmps back into the user code whose state is
 * stored in cilk_fiber_get_data(fiber)->resume_sf.  The stack pointer
 * is adjusted so that the code resumes on the specified fiber stack
 * instead of its original stack.
 *
 * This method gets executed only on a fiber freshly allocated from a
 * pool.
 *
 * @param fiber   The fiber being used to resume user code.
 * @param arg     Unused.
 */
static
void fiber_proc_to_resume_user_code_for_random_steal(cilk_fiber *fiber)
{
    cilk_fiber_data *data = cilk_fiber_get_data(fiber);
    __cilkrts_stack_frame* sf = data->resume_sf;
    full_frame *ff;

    CILK_ASSERT(sf);

    // When we pull the resume_sf out of the fiber to resume it, clear
    // the old value.
    data->resume_sf = NULL;
    CILK_ASSERT(sf->worker == data->owner);
    ff = sf->worker->l->frame_ff;

    // For Win32, we need to overwrite the default exception handler
    // in this function, so that when the OS exception handling code
    // walks off the top of the current Cilk stack, it reaches our stub
    // handler.

    // Also, this function needs to be wrapped into a try-catch block
    // so the compiler generates the appropriate exception information
    // in this frame.

    // TBD: IS THIS HANDLER IN THE WRONG PLACE?  Can we longjmp out of
    // this function (and does it matter?)
#if defined(_WIN32) && !defined(_WIN64)
    install_exception_stub_handler();
    __try
#endif
    {
        char* new_sp = sysdep_reset_jump_buffers_for_resume(fiber, ff, sf);

        // Notify the Intel tools that we're stealing code
        ITT_SYNC_ACQUIRED(sf->worker);
        NOTIFY_ZC_INTRINSIC("cilk_continue", sf);

        // TBD: We'd like to move TBB-interop methods into the fiber
        // eventually.
        cilk_fiber_invoke_tbb_stack_op(fiber, CILK_TBB_STACK_ADOPT);

        sf->flags &= ~CILK_FRAME_SUSPENDED;

        // longjmp to user code.  Don't process exceptions here,
        // because we are resuming a stolen frame.
        sysdep_longjmp_to_sf(new_sp, sf, NULL);
        /*NOTREACHED*/
        // Intel's C compiler respects the preceding lint pragma
    }
#if defined(_WIN32) && !defined(_WIN64)
    __except (CILK_ASSERT(!"should not execute the stub filter"),
              EXCEPTION_EXECUTE_HANDLER)
    {
        // If we are here, that means something very wrong
        // has happened in our exception processing...
        CILK_ASSERT(! "should not be here!");
    }
#endif
}

static void random_steal(__cilkrts_worker *w)
{
    __cilkrts_worker *victim = NULL;
    cilk_fiber *fiber = NULL;
    int n;
    int success = 0;
    int32_t victim_id;

    // Nothing's been stolen yet.  When true, this will flag
    // setup_for_execution_pedigree to increment the pedigree
    w->l->work_stolen = 0;

    /* If the user has disabled stealing (using the debugger) we fail */
    if (__builtin_expect(w->g->stealing_disabled, 0))
        return;

    CILK_ASSERT(w->l->type == WORKER_SYSTEM || w->l->team == w);

    /* If there is only one processor, work can still be stolen.
       There must be only one worker to prevent stealing. */
    CILK_ASSERT(w->g->total_workers > 1);

    /* pick random *other* victim */
    n = myrand(w) % (w->g->total_workers - 1);
    if (n >= w->self)
        ++n;

    // If we're replaying a log, override the victim.  -1 indicates that
    // we've exhausted the list of things this worker stole when we recorded
    // the log so just return.  If we're not replaying a log,
    // replay_get_next_recorded_victim() just returns the victim ID passed in.
    n = replay_get_next_recorded_victim(w, n);
    if (-1 == n)
        return;

    victim = w->g->workers[n];

    START_INTERVAL(w, INTERVAL_FIBER_ALLOCATE) {
        /* Verify that we can get a stack.  If not, no need to continue. */
        fiber = cilk_fiber_allocate(&w->l->fiber_pool);
    } STOP_INTERVAL(w, INTERVAL_FIBER_ALLOCATE);


    if (NULL == fiber) {
#if FIBER_DEBUG >= 2
        fprintf(stderr, "w=%d: failed steal because we could not get a fiber\n",
                w->self);
#endif
        return;
    }

    /* do not steal from self */
    CILK_ASSERT (victim != w);

    /* Execute a quick check before engaging in the THE protocol.
       Avoid grabbing locks if there is nothing to steal. */
    if (!can_steal_from(victim)) {
        NOTE_INTERVAL(w, INTERVAL_STEAL_FAIL_EMPTYQ);
        START_INTERVAL(w, INTERVAL_FIBER_DEALLOCATE) {
            int ref_count = cilk_fiber_remove_reference(fiber, &w->l->fiber_pool);
            // Fibers we use when trying to steal should not be active,
            // and thus should not have any other references.
            CILK_ASSERT(0 == ref_count);
        } STOP_INTERVAL(w, INTERVAL_FIBER_DEALLOCATE);
        return;
    }

    /* Attempt to steal work from the victim */
    if (worker_trylock_other(w, victim)) {
        if (w->l->type == WORKER_USER && victim->l->team != w) {

            // Fail to steal if this is a user worker and the victim is not
            // on this team.  If a user worker were allowed to steal work
            // descended from another user worker, the former might not be
            // done with its work by the time it was needed to resume and
            // unbind.  Therefore, user workers are not permitted to change
            // teams.

            // There is no race on the victim's team because the victim cannot
            // change its team until it runs out of work to do, at which point
            // it will try to take out its own lock, and this worker already
            // holds it.
            NOTE_INTERVAL(w, INTERVAL_STEAL_FAIL_USER_WORKER);

        } else if (victim->l->frame_ff) {
            // A successful steal will change victim->frame_ff, even
            // though the victim may be executing.  Thus, the lock on
            // the victim's deque is also protecting victim->frame_ff.
            if (dekker_protocol(victim)) {
                int proceed_with_steal = 1; // optimistic

                // If we're replaying a log, verify that this is the correct
                // frame to steal from the victim
                if (! replay_match_victim_pedigree(w, victim))
                {
                    // Abort the steal attempt.  decrement_E(victim) to
                    // counter the increment_E(victim) done by the
                    // dekker protocol
                    decrement_E(victim);
                    proceed_with_steal = 0;
                }

                if (proceed_with_steal)
                {
                    START_INTERVAL(w, INTERVAL_STEAL_SUCCESS) {
                        success = 1;
                        detach_for_steal(w, victim, fiber);
                        victim_id = victim->self;

                        #if REDPAR_DEBUG >= 1
                        fprintf(stderr, "Wkr %d stole from victim %d, fiber = %p\n",
                                w->self, victim->self, fiber);
                        #endif

                        // The use of victim->self contradicts our
                        // classification of the "self" field as
                        // local.  But since this code is only for
                        // debugging, it is ok.
                        DBGPRINTF ("%d-%p: Stealing work from worker %d\n"
                                   "            sf: %p, call parent: %p\n",
                                   w->self, GetCurrentFiber(), victim->self,
                                   w->l->next_frame_ff->call_stack,
                                   w->l->next_frame_ff->call_stack->call_parent);
                    } STOP_INTERVAL(w, INTERVAL_STEAL_SUCCESS);
                }  // end if(proceed_with_steal)
            } else {
                NOTE_INTERVAL(w, INTERVAL_STEAL_FAIL_DEKKER);
            }
        } else {
            NOTE_INTERVAL(w, INTERVAL_STEAL_FAIL_EMPTYQ);
        }
        worker_unlock_other(w, victim);
    } else {
        NOTE_INTERVAL(w, INTERVAL_STEAL_FAIL_LOCK);
    }

    // Record whether work was stolen.  When true, this will flag
    // setup_for_execution_pedigree to increment the pedigree
    w->l->work_stolen = success;

    if (0 == success) {
        // failed to steal work.  Return the fiber to the pool.
        START_INTERVAL(w, INTERVAL_FIBER_DEALLOCATE) {
            int ref_count = cilk_fiber_remove_reference(fiber, &w->l->fiber_pool);
            // Fibers we use when trying to steal should not be active,
            // and thus should not have any other references.
            CILK_ASSERT(0 == ref_count);
        } STOP_INTERVAL(w, INTERVAL_FIBER_DEALLOCATE);
    }
    else
    {
        // Since our steal was successful, finish initialization of
        // the fiber.
        cilk_fiber_reset_state(fiber,
                               fiber_proc_to_resume_user_code_for_random_steal);
        // Record the pedigree of the frame that w has stolen.
        // record only if CILK_RECORD_LOG is set
        replay_record_steal(w, victim_id);
    }
}
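/*
 * Sketch of the victim-selection arithmetic in random_steal() above
 * (hypothetical helper, with example values).  Drawing n uniformly from
 * [0, total-2] and bumping values at or above self yields a uniform
 * choice over the total-1 *other* workers; e.g., with total = 4 and
 * self = 2, n in {0,1,2} maps to {0,1,3}.
 */
#if 0
static int pick_other_victim_demo(int self, int total, unsigned r)
{
    int n = (int)(r % (unsigned)(total - 1)); /* uniform in [0, total-2] */
    if (n >= self)
        ++n;                                  /* skip self, stay uniform */
    return n;
}
#endif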
/**
 * At a provably good steal, we need to transfer the child reducer map
 * from ff->children_reducer_map into v->reducer_map, where v is the
 * worker that resumes execution of ff.
 *
 * Normally, we have v == w, where w is the currently executing
 * worker.  In the case where we are resuming a team leader on a user
 * worker, however, v might differ from w.
 *
 * Thus, this operation is a no-op, since we can't really move
 * ff->children_reducer_map into w here.
 *
 * Instead, this work is done in setup_for_execution_reducers().
 */
static inline void provably_good_steal_reducers(__cilkrts_worker *w,
                                                full_frame       *ff)
{
    // No-op.
}

/* at a provably good steal, incorporate the accumulated exceptions of
   children into the parent's exception */
static void provably_good_steal_exceptions(__cilkrts_worker *w,
                                           full_frame       *ff)
{
    // ASSERT: we own ff->lock
    ff->pending_exception =
        __cilkrts_merge_pending_exceptions(w,
                                           ff->child_pending_exception,
                                           ff->pending_exception);
    ff->child_pending_exception = NULL;
}

/* At sync discard the frame's old stack and take the leftmost child's. */
static void provably_good_steal_stacks(__cilkrts_worker *w, full_frame *ff)
{
    CILK_ASSERT(NULL == ff->fiber_self);
    ff->fiber_self = ff->fiber_child;
    ff->fiber_child = NULL;
}

static void __cilkrts_mark_synched(full_frame *ff)
{
    ff->call_stack->flags &= ~CILK_FRAME_UNSYNCHED;
    ff->simulated_stolen = 0;
}

static
enum provably_good_steal_t provably_good_steal(__cilkrts_worker *w,
                                               full_frame       *ff)
{
    // ASSERT: we hold w->lock and ff->lock

    enum provably_good_steal_t result = ABANDON_EXECUTION;

    // If the current replay entry is a sync record matching the worker's
    // pedigree, AND this isn't the last child to the sync, return
    // WAIT_FOR_CONTINUE to indicate that the caller should loop until
    // we find the right frame to steal and CONTINUE_EXECUTION is returned.
    int match_found = replay_match_sync_pedigree(w);
    if (match_found && (0 != simulate_decjoin(ff)))
        return WAIT_FOR_CONTINUE;

    START_INTERVAL(w, INTERVAL_PROVABLY_GOOD_STEAL) {
        if (decjoin(ff) == 0) {
            provably_good_steal_reducers(w, ff);
            provably_good_steal_exceptions(w, ff);
            provably_good_steal_stacks(w, ff);
            __cilkrts_mark_synched(ff);

            // If the original owner wants this frame back (to resume
            // it on its original thread) pass it back now.
            if (NULL != ff->sync_master) {
                // The frame wants to go back and be executed by the original
                // user thread.  We can throw caution to the wind and push the
                // frame straight onto its queue because the only way we have
                // gotten to this point of being able to continue execution of
                // the frame is if the original user worker is spinning without
                // work.

                unset_sync_master(w->l->team, ff);
                __cilkrts_push_next_frame(w->l->team, ff);

                // If this is the team leader we're not abandoning the work
                if (w == w->l->team)
                    result = CONTINUE_EXECUTION;
            } else {
                __cilkrts_push_next_frame(w, ff);
                result = CONTINUE_EXECUTION;  // Continue working on this thread
            }

            // The __cilkrts_push_next_frame() call changes ownership
            // of ff to the specified worker.
        }
    } STOP_INTERVAL(w, INTERVAL_PROVABLY_GOOD_STEAL);

    // Only write a SYNC record if:
    // - We're recording a log *AND*
    // - We're the worker continuing from this sync
    replay_record_sync(w, result == CONTINUE_EXECUTION);

    // If we're replaying a log, and matched a sync from the log, mark the
    // sync record seen if the sync isn't going to be abandoned.
    replay_advance_from_sync (w, match_found, result == CONTINUE_EXECUTION);

    return result;
}

static void unconditional_steal(__cilkrts_worker *w,
                                full_frame *ff)
{
    // ASSERT: we hold ff->lock

    START_INTERVAL(w, INTERVAL_UNCONDITIONAL_STEAL) {
        decjoin(ff);
        __cilkrts_push_next_frame(w, ff);
    } STOP_INTERVAL(w, INTERVAL_UNCONDITIONAL_STEAL);
}


/* CHILD is about to die.  Give its exceptions to a sibling or to the
   parent.  */
static inline void splice_exceptions_for_call(__cilkrts_worker *w,
                                              full_frame *parent_ff,
                                              full_frame *child_ff)
{
    // ASSERT: We own parent_ff->lock
    CILK_ASSERT(child_ff->is_call_child);
    CILK_ASSERT(NULL == child_ff->right_pending_exception);
    CILK_ASSERT(NULL == parent_ff->pending_exception);

    parent_ff->pending_exception = child_ff->pending_exception;
    child_ff->pending_exception = NULL;
}

/**
 * Merge exceptions for a dying child.
 *
 * @param w                   The currently executing worker.
 * @param ff                  The child frame that is dying.
 * @param left_exception_ptr  Pointer to the exception that is to our left.
 */
static inline
void splice_exceptions_for_spawn(__cilkrts_worker *w,
                                 full_frame *ff,
                                 struct pending_exception_info **left_exception_ptr)
{
    // ASSERT: parent_ff == child_ff->parent.
    // ASSERT: We own parent_ff->lock

    // Merge current exception into the slot where the left
    // exception should go.
    *left_exception_ptr =
        __cilkrts_merge_pending_exceptions(w,
                                           *left_exception_ptr,
                                           ff->pending_exception);
    ff->pending_exception = NULL;


    // Merge right exception into the slot where the left exception
    // should go.
    *left_exception_ptr =
        __cilkrts_merge_pending_exceptions(w,
                                           *left_exception_ptr,
                                           ff->right_pending_exception);
    ff->right_pending_exception = NULL;
}
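/*
 * Order-of-merge sketch for the two calls above (hypothetical helper;
 * the real work happens in splice_exceptions_for_spawn).  The result is
 * a single left-to-right fold, so the leftmost pending exception wins,
 * matching serial-execution order.
 */
#if 0
static struct pending_exception_info *
merge_order_demo(__cilkrts_worker *w,
                 struct pending_exception_info *left,
                 struct pending_exception_info *mine,
                 struct pending_exception_info *right)
{
    left = __cilkrts_merge_pending_exceptions(w, left, mine);
    left = __cilkrts_merge_pending_exceptions(w, left, right);
    return left;
}
#endif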
/**
 * The invariant on ff->children_reducer_map is that when ff is
 * synched and when we are about to resume execution of ff, at least
 * one of ff->children_reducer_map and w->reducer_map must be NULL.
 *
 * Consider the two possibilities before resuming execution of ff:
 *
 * 1.  Suppose ff is synched and suspended.  Then either
 *
 *     (a) ff->children_reducer_map stores the reducer map that w
 *         should use, where w is the worker resuming execution of ff,
 *         OR
 *     (b) w already has a user map, and ff->children_reducer_map is NULL.
 *
 *     Case (a) happens when we are resuming execution of ff as a
 *     provably good steal.  In this case, w->reducer_map should be
 *     NULL and ff->children_reducer_map is valid.  To resume
 *     execution of ff on w, set w->reducer_map to
 *     ff->children_reducer_map.
 *
 *     Case (b) occurs when we resume execution of ff because ff is a
 *     called child.  Then, ff->children_reducer_map should be NULL,
 *     and w should already have a valid reducer map when resuming
 *     execution of ff.  We resume execution of ff without changing
 *     w->reducer_map.
 *
 * 2.  Suppose frame ff is not synched (i.e., it is active and might have
 *     active children).  Then ff->children_reducer_map is the slot for
 *     storing the reducer map from ff's leftmost child, as in the reducer
 *     protocol.  The runtime may resume execution of ff while it is not
 *     synched only because of a steal.
 *     In this case, while we are resuming ff, ff->children_reducer_map
 *     may be non-NULL (because one of ff's children has completed).
 *     We resume execution of ff without changing w->reducer_map.
 */
static void setup_for_execution_reducers(__cilkrts_worker *w,
                                         full_frame *ff)
{
    // We only need to move ff->children_reducer_map into
    // w->reducer_map in case 1(a).
    //
    // First check whether ff is synched.
    __cilkrts_stack_frame *sf = ff->call_stack;
    if (!(sf->flags & CILK_FRAME_UNSYNCHED)) {
        // In this case, ff is synched. (Case 1).
        CILK_ASSERT(!ff->rightmost_child);

        // Test whether we are in case 1(a) and have
        // something to do.  Note that if both
        // ff->children_reducer_map and w->reducer_map are NULL, we
        // can't distinguish between cases 1(a) and 1(b) here.
        if (ff->children_reducer_map) {
            // We are in Case 1(a).
            CILK_ASSERT(!w->reducer_map);
            w->reducer_map = ff->children_reducer_map;
            ff->children_reducer_map = NULL;
        }
    }
}
static void setup_for_execution_exceptions(__cilkrts_worker *w,
                                           full_frame *ff)
{
    CILK_ASSERT(NULL == w->l->pending_exception);
    w->l->pending_exception = ff->pending_exception;
    ff->pending_exception = NULL;
}

#if 0 /* unused */
static void setup_for_execution_stack(__cilkrts_worker *w,
                                      full_frame *ff)
{
}
#endif

/*
 * setup_for_execution_pedigree
 *
 * Copies the pedigree information from the frame we're resuming to the
 * worker.  Increments the pedigree if this is work that has been stolen
 * to match the increment on a return from a spawn helper.
 */
static void setup_for_execution_pedigree(__cilkrts_worker *w)
{
    int pedigree_unsynched;
    __cilkrts_stack_frame *sf = w->current_stack_frame;

    CILK_ASSERT(NULL != sf);

    // If this isn't an ABI 1 or later frame, there's no pedigree information
    if (0 == CILK_FRAME_VERSION_VALUE(sf->flags))
        return;

    // Note whether the pedigree is unsynched and clear the flag before
    // we forget
    pedigree_unsynched = sf->flags & CILK_FRAME_SF_PEDIGREE_UNSYNCHED;
    sf->flags &= ~CILK_FRAME_SF_PEDIGREE_UNSYNCHED;

    // If we're just marshalling onto this worker, do not increment
    // the rank since that wouldn't happen in a sequential execution
    if (w->l->work_stolen || pedigree_unsynched)
    {
        if (w->l->work_stolen)
            w->pedigree.rank = sf->parent_pedigree.rank + 1;
        else
            w->pedigree.rank = sf->parent_pedigree.rank;
    }

    w->pedigree.parent = sf->parent_pedigree.parent;
    w->l->work_stolen = 0;
}
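/* Worked example (illustrative): suppose a frame is suspended with
 * parent_pedigree.rank == 3.  If another worker steals and resumes it
 * (w->l->work_stolen != 0), the resuming worker continues at rank 4 --
 * the same increment a serial run would perform when the spawn helper
 * returned.  If instead the frame is merely re-marshalled onto a worker
 * with the pedigree flagged unsynched, the rank stays at 3 and only the
 * parent pointer is refreshed. */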
static void setup_for_execution(__cilkrts_worker *w,
                                full_frame *ff,
                                int is_return_from_call)
{
    // ASSERT: We own w->lock and ff->lock || P == 1

    setup_for_execution_reducers(w, ff);
    setup_for_execution_exceptions(w, ff);
    /*setup_for_execution_stack(w, ff);*/

    ff->call_stack->worker = w;
    w->current_stack_frame = ff->call_stack;

    // If this is a return from a call, leave the pedigree alone
    if (! is_return_from_call)
        setup_for_execution_pedigree(w);

    __cilkrts_setup_for_execution_sysdep(w, ff);

    w->head = w->tail = w->l->ltq;
    reset_THE_exception(w);

    make_runnable(w, ff);
}
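/* Summary note: after setup_for_execution() returns, the worker is fully
 * re-targeted at ff -- reducers and pending exceptions have been moved
 * from the frame into the worker, the deque is reset to empty
 * (head == tail == ltq), and the THE exception state is cleared, so the
 * next spawn executed from ff starts from a clean ledger. */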
/*
 * Called by the scheduling fiber, right before
 * resuming a sf/ff for user code.
 *
 * This method associates the specified sf with the worker.
 *
 * It also asserts that w, ff, and sf all have the expected properties
 * for resuming user code.
 */
void scheduling_fiber_prepare_to_resume_user_code(__cilkrts_worker *w,
                                                  full_frame *ff,
                                                  __cilkrts_stack_frame *sf)
{
    w->current_stack_frame = sf;
    sf->worker = w;

    // Lots of debugging checks on the state of the fiber we might be
    // resuming.
#if FIBER_DEBUG >= 1
# if FIBER_DEBUG >= 3
    {
        fprintf(stderr, "w=%d: ff=%p, sf=%p. about to resume user code\n",
                w->self, ff, sf);
    }
# endif

    const int flags = sf->flags;
    CILK_ASSERT(flags & CILK_FRAME_SUSPENDED);
    CILK_ASSERT(!sf->call_parent);
    CILK_ASSERT(w->head == w->tail);

    /* A frame can not be resumed unless it was suspended. */
    CILK_ASSERT(ff->sync_sp != NULL);

    /* The leftmost frame has no allocated stack */
    if (ff->simulated_stolen)
        CILK_ASSERT(flags & CILK_FRAME_UNSYNCHED);
    else if (flags & CILK_FRAME_UNSYNCHED)
        /* XXX By coincidence sync_sp could be null. */
        CILK_ASSERT(ff->fiber_self != NULL);
    else
        /* XXX This frame could be resumed unsynched on the leftmost stack */
        CILK_ASSERT((ff->sync_master == 0 || ff->sync_master == w));
    CILK_ASSERT(w->l->frame_ff == ff);
#endif
}


/**
 * This method is the first method that should execute after we've
 * switched to a scheduling fiber from user code.
 *
 * @param fiber The scheduling fiber for the current worker.
 */
static void enter_runtime_transition_proc(cilk_fiber *fiber)
{
    // We can execute this method for one of three reasons:
    // 1. Undo-detach finds parent stolen.
    // 2. Sync suspends frame.
    // 3. Return from Cilk entry point.
    //
    // In cases 1 and 2, the frame may be truly suspended or
    // may be immediately executed by this worker after provably_good_steal.
    //
    // There is a fourth case, which can, but does not need to execute
    // this function:
    //    4. Starting up the scheduling loop on a user or
    //       system worker.  In this case, we won't have
    //       a scheduling stack function to run.
    __cilkrts_worker* w = cilk_fiber_get_owner(fiber);
    if (w->l->post_suspend) {
        // Run the continuation function passed to longjmp_into_runtime
        run_scheduling_stack_fcn(w);

        // After we have jumped into the runtime and run the
        // scheduling function, any reducer map the worker had before
        // entering the runtime should have already been saved into the
        // appropriate full frame.
        CILK_ASSERT(NULL == w->reducer_map);

        // There shouldn't be any uncaught exceptions.
        //
        // In Windows, the OS catches any exceptions not caught by the
        // user code.  Thus, we are omitting the check on Windows.
        //
        // On Android, calling std::uncaught_exception with the stlport
        // library causes a seg fault.  Since we're not supporting
        // exceptions there at this point, just don't do the check
        //
        // TBD: Is this check also safe to do on Windows?
        CILKBUG_ASSERT_NO_UNCAUGHT_EXCEPTION();
    }
}


/**
 * Method called to jump back to executing user code.
 *
 * A normal return from the runtime back to resuming user code calls
 * this method.  A computation executed using force_reduce also calls
 * this method to return to user code.
 *
 * This function should not contain any code that depends on a fiber.
 * In a force-reduce case, the user worker may not have a fiber.  In
 * the force-reduce case, we call this method directly instead of
 * calling @c user_code_resume_after_switch_into_runtime.
 */
static inline NORETURN
cilkrts_resume(__cilkrts_stack_frame *sf, full_frame *ff)
{
    // Save the sync stack pointer, and do the bookkeeping
    char* sync_sp = ff->sync_sp;
    __cilkrts_take_stack(ff, sync_sp);  // leaves ff->sync_sp null

    sf->flags &= ~CILK_FRAME_SUSPENDED;
    // Actually longjmp to the user code.
    // We may have exceptions to deal with, since we are resuming
    // a previously-suspended frame.
    sysdep_longjmp_to_sf(sync_sp, sf, ff);
}
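/* Note: __cilkrts_take_stack() above deliberately nulls ff->sync_sp as it
 * hands the stack pointer to the longjmp.  That pairs with the
 * "ff->sync_sp != NULL" assert in
 * scheduling_fiber_prepare_to_resume_user_code(): a frame that has already
 * been resumed once cannot be resumed again without first being
 * re-suspended. */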
/**
 * Called by the user-code fiber right before resuming a full frame
 * (sf/ff).
 *
 * This method pulls sf/ff out of the worker, and then calls
 * cilkrts_resume to jump to user code.
 */
static NORETURN
user_code_resume_after_switch_into_runtime(cilk_fiber *fiber)
{
    __cilkrts_worker *w = cilk_fiber_get_owner(fiber);
    __cilkrts_stack_frame *sf;
    full_frame *ff;
    sf = w->current_stack_frame;
    ff = sf->worker->l->frame_ff;

#if FIBER_DEBUG >= 1
    CILK_ASSERT(ff->fiber_self == fiber);
    cilk_fiber_data *fdata = cilk_fiber_get_data(fiber);
    DBGPRINTF ("%d-%p: resume_after_switch_into_runtime, fiber=%p\n",
               w->self, w, fiber);
    CILK_ASSERT(sf == fdata->resume_sf);
#endif

    // Notify the Intel tools that we're stealing code
    ITT_SYNC_ACQUIRED(sf->worker);
    NOTIFY_ZC_INTRINSIC("cilk_continue", sf);
    cilk_fiber_invoke_tbb_stack_op(fiber, CILK_TBB_STACK_ADOPT);

    // Actually jump to user code.
    cilkrts_resume(sf, ff);
}

/* The current stack is about to either be suspended or destroyed.  This
 * function will switch to the stack on which the scheduler is suspended and
 * resume running the scheduler within function do_work().  Upon waking up,
 * the scheduler will run the 'cont' function, using the supplied worker and
 * frame.
 */
static NORETURN
longjmp_into_runtime(__cilkrts_worker *w,
                     scheduling_stack_fcn_t fcn,
                     __cilkrts_stack_frame *sf)
{
    full_frame *ff, *ff2;

    CILK_ASSERT(!w->l->post_suspend);
    ff = w->l->frame_ff;

    // If we've got only one worker, stealing shouldn't be possible.
    // Assume that this is a steal or return from spawn in a force-reduce case.
    // We don't have a scheduling stack to switch to, so call the continuation
    // function directly.
    if (1 == w->g->P) {
        fcn(w, ff, sf);

        /* The call to function c() will have pushed ff as the next frame.  If
         * this were a normal (non-forced-reduce) execution, there would have
         * been a pop_next_frame call in a separate part of the runtime.  We
         * must call pop_next_frame here to complete the push/pop cycle. */
        ff2 = pop_next_frame(w);

        setup_for_execution(w, ff2, 0);
        scheduling_fiber_prepare_to_resume_user_code(w, ff2, w->current_stack_frame);
        cilkrts_resume(w->current_stack_frame, ff2);

// Suppress clang warning that the expression result is unused
#if defined(__clang__) && (! defined(__INTEL_COMPILER))
#   pragma clang diagnostic push
#   pragma clang diagnostic ignored "-Wunused-value"
#endif // __clang__
        /* no return */
        CILK_ASSERT(((void)"returned from __cilkrts_resume", 0));
#if defined(__clang__) && (! defined(__INTEL_COMPILER))
#   pragma clang diagnostic pop
#endif // __clang__
    }

    w->l->post_suspend = fcn;
    w->l->suspended_stack = sf;

    ITT_SYNC_RELEASING(w);
    ITT_SYNC_PREPARE(w);

#if FIBER_DEBUG >= 2
    fprintf(stderr, "ThreadId=%p, W=%d: about to switch into runtime... w->l->frame_ff = %p, sf=%p\n",
            cilkos_get_current_thread_id(),
            w->self, w->l->frame_ff,
            sf);
#endif

    // Current fiber is either the (1) one we are about to free,
    // or (2) it has been passed up to the parent.
    cilk_fiber *current_fiber = ( w->l->fiber_to_free ?
                                  w->l->fiber_to_free :
                                  w->l->frame_ff->parent->fiber_child );
    cilk_fiber_data* fdata = cilk_fiber_get_data(current_fiber);
    CILK_ASSERT(NULL == w->l->frame_ff->fiber_self);

    // Clear the sf in the current fiber for cleanliness, to prevent
    // us from accidentally resuming a bad sf.
    // Technically, resume_sf gets overwritten for a fiber when
    // we are about to resume it anyway.
    fdata->resume_sf = NULL;
    CILK_ASSERT(fdata->owner == w);

    // Set the function to execute immediately after switching to the
    // scheduling fiber, but before freeing any fibers.
    cilk_fiber_set_post_switch_proc(w->l->scheduling_fiber,
                                    enter_runtime_transition_proc);
    cilk_fiber_invoke_tbb_stack_op(current_fiber, CILK_TBB_STACK_ORPHAN);
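    // Two cases follow.  Either we free the current fiber outright and
    // never return to it (the "sync" path), or we merely suspend it --
    // because it has been donated to our parent as the leftmost child's
    // fiber -- and expect control to come back here later.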
    if (w->l->fiber_to_free) {
        // Case 1: we are freeing this fiber.  We never
        // resume this fiber again after jumping into the runtime.
        w->l->fiber_to_free = NULL;

        // Extra check. Normally, the fiber we are about to switch to
        // should have a NULL owner.
        CILK_ASSERT(NULL == cilk_fiber_get_data(w->l->scheduling_fiber)->owner);
#if FIBER_DEBUG >= 4
        fprintf(stderr, "ThreadId=%p, W=%d: about to switch into runtime.. current_fiber = %p, deallocate, switch to fiber %p\n",
                cilkos_get_current_thread_id(),
                w->self,
                current_fiber, w->l->scheduling_fiber);
#endif
        cilk_fiber_invoke_tbb_stack_op(current_fiber, CILK_TBB_STACK_RELEASE);
        NOTE_INTERVAL(w, INTERVAL_DEALLOCATE_RESUME_OTHER);
        cilk_fiber_remove_reference_from_self_and_resume_other(current_fiber,
                                                               &w->l->fiber_pool,
                                                               w->l->scheduling_fiber);
        // We should never come back here!
        CILK_ASSERT(0);
    }
    else {
        // Case 2: We are passing the fiber to our parent because we
        //         are leftmost.  We should come back later to
        //         resume execution of user code.
        //
        // If we are not freeing a fiber, then we must be
        // returning from a spawn or processing an exception.  The
        // "sync" path always frees a fiber.
        //
        // We must be the leftmost child, and by left holder logic, we
        // have already moved the current fiber into our parent full
        // frame.
#if FIBER_DEBUG >= 2
        fprintf(stderr, "ThreadId=%p, W=%d: about to suspend self into runtime.. current_fiber = %p, deallocate, switch to fiber %p\n",
                cilkos_get_current_thread_id(),
                w->self,
                current_fiber, w->l->scheduling_fiber);
#endif

        NOTE_INTERVAL(w, INTERVAL_SUSPEND_RESUME_OTHER);

        cilk_fiber_suspend_self_and_resume_other(current_fiber,
                                                 w->l->scheduling_fiber);
        // Resuming this fiber returns control back to
        // this function because our implementation uses OS fibers.
        //
        // On Unix, we could have the choice of passing the
        // user_code_resume_after_switch_into_runtime as an extra "resume_proc"
        // that resumes execution of user code instead of the
        // jumping back here, and then jumping back to user code.
#if FIBER_DEBUG >= 2
        CILK_ASSERT(fdata->owner == __cilkrts_get_tls_worker());
#endif
        user_code_resume_after_switch_into_runtime(current_fiber);
    }
}

/*
 * Send a message to the children of the specified worker: run or wait.
 */
static void notify_children(__cilkrts_worker *w, unsigned int msg)
{
    int child_num;
    __cilkrts_worker *child;
    int num_sys_workers = w->g->P - 1;

    // If worker is "n", then its children are 2n + 1, and 2n + 2.
    child_num = (w->self << 1) + 1;
    if (child_num < num_sys_workers) {
        child = w->g->workers[child_num];
        CILK_ASSERT(child->l->signal_node);
        signal_node_msg(child->l->signal_node, msg);
        child_num++;
        if (child_num < num_sys_workers) {
            child = w->g->workers[child_num];
            CILK_ASSERT(child->l->signal_node);
            signal_node_msg(child->l->signal_node, msg);
        }
    }
}

/*
 * Notify this worker's children that they need to wait.
 */
static void notify_children_wait(__cilkrts_worker *w)
{
    notify_children(w, 0);
}

/*
 * Notify this worker's children to run and start trying to steal.
 */
static void notify_children_run(__cilkrts_worker *w)
{
    notify_children(w, 1);
}
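/* Illustration: the system workers form an implicit binary tree, exactly
 * like the array layout of a binary heap.  With P = 8 there are 7 system
 * workers, and a run/wait message fans out as:
 *
 *     worker 0 -> workers 1, 2
 *     worker 1 -> workers 3, 4
 *     worker 2 -> workers 5, 6
 *
 * so a signal reaches every system worker in O(log P) hops instead of one
 * thread having to signal all of them. */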
/**
 * A single "check" to find work, either on our queue or through a
 * steal attempt.  This method checks our local queue once, and
 * performs one steal attempt.
 */
static full_frame* check_for_work(__cilkrts_worker *w)
{
    full_frame *ff = NULL;
    ff = pop_next_frame(w);
    // If there is no work on the queue, try to steal some.
    if (NULL == ff) {
        START_INTERVAL(w, INTERVAL_STEALING) {
            if (w->l->type != WORKER_USER && w->l->team != NULL) {
                // At this point, the worker knows for certain that it has run
                // out of work.  Therefore, it loses its team affiliation.  User
                // workers never change teams, of course.
                __cilkrts_worker_lock(w);
                w->l->team = NULL;
                __cilkrts_worker_unlock(w);
            }

            // If we are about to do a random steal, we should have no
            // full frame...
            CILK_ASSERT(NULL == w->l->frame_ff);
            random_steal(w);
        } STOP_INTERVAL(w, INTERVAL_STEALING);

        // If the steal was successful, then the worker has populated its next
        // frame with the work to resume.
        ff = pop_next_frame(w);
        if (NULL == ff) {
            // Punish the worker for failing to steal.
            // No quantum for you!
            __cilkrts_yield();
            w->l->steal_failure_count++;
        } else {
            // Reset steal_failure_count since there is obviously still work to
            // be done.
            w->l->steal_failure_count = 0;
        }
    }
    return ff;
}
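/* Design note: yielding after a failed steal is a lightweight backoff --
 * an idle worker donates the rest of its OS time slice instead of
 * spinning.  The steal_failure_count kept here is reset both on a
 * successful steal and whenever the runtime wakes a waiting worker
 * (see the SCHEDULE_WAIT case below). */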
/**
 * Keep stealing, or looking for work on our queue, until a full frame
 * is found or the computation is done.
 *
 * Returns either when a full frame is found, or NULL if the
 * computation is done.
 */
static full_frame* search_until_work_found_or_done(__cilkrts_worker *w)
{
    full_frame *ff = NULL;
    // Find a full frame to execute (either through random stealing,
    // or because we pull it off w's 1-element queue).
    while (!ff) {
        // Check worker state to figure out our next action.
        switch (worker_runnable(w))
        {
        case SCHEDULE_RUN:             // One attempt at checking for work.
            ff = check_for_work(w);
            break;
        case SCHEDULE_WAIT:            // go into wait-mode.
            CILK_ASSERT(WORKER_SYSTEM == w->l->type);
            // If we are about to wait, then we better not have
            // a frame that we should execute...
            CILK_ASSERT(NULL == w->l->next_frame_ff);
            notify_children_wait(w);
            signal_node_wait(w->l->signal_node);
            // ...
            // Runtime is waking up.
            notify_children_run(w);
            w->l->steal_failure_count = 0;
            break;
        case SCHEDULE_EXIT:            // exit the scheduler.
            CILK_ASSERT(WORKER_USER != w->l->type);
            return NULL;
        default:
            CILK_ASSERT(0);
            abort();
        }
    }
    return ff;
}

/**
 * The proc method for a scheduling fiber on a user worker.
 *
 * When a user worker jumps into the runtime, it jumps into this
 * method by either starting it if the scheduling fiber has never run
 * before, or resuming the fiber if it was previously suspended.
 */
COMMON_PORTABLE
void scheduler_fiber_proc_for_user_worker(cilk_fiber *fiber)
{
    __cilkrts_worker* w = cilk_fiber_get_owner(fiber);
    CILK_ASSERT(w);

    // This must be a user worker
    CILK_ASSERT(WORKER_USER == w->l->type);

    // If we aren't the current worker, then something is very wrong
    // here..
    verify_current_wkr(w);

    __cilkrts_run_scheduler_with_exceptions(w);
}
/**
 * The body of the runtime scheduling loop.  This function executes in
 * 4 stages:
 *
 * 1. Transitions from the user code into the runtime by
 *    executing any scheduling-stack functions.
 * 2. Looks for a full frame enqueued from a successful provably
 *    good steal.
 * 3. If no full frame is found in step 2, steal until
 *    a frame is found or we are done.  If we are done, finish
 *    the scheduling loop.
 * 4. When a frame is found, setup to resume user code.
 *    In particular, suspend the current fiber and resume the
 *    user fiber to execute the frame.
 *
 * Returns a fiber object that we should switch to after completing
 * the body of the loop, or NULL if we should continue executing on
 * this fiber.
 *
 * @pre @c current_fiber should equal @c wptr->l->scheduling_fiber
 *
 * @param current_fiber   The currently executing (scheduling) fiber
 * @param wptr            The currently executing worker.
 * @return  The next fiber we should switch to.
 */
static cilk_fiber* worker_scheduling_loop_body(cilk_fiber* current_fiber,
                                               void* wptr)
{
    __cilkrts_worker *w = (__cilkrts_worker*) wptr;
    CILK_ASSERT(current_fiber == w->l->scheduling_fiber);

    // Stage 1: Transition from executing user code to the runtime code.
    // We don't need to do this call here any more, because
    // every switch to the scheduling fiber should make this call
    // using a post_switch_proc on the fiber.
    //
    //  enter_runtime_transition_proc(w->l->scheduling_fiber, wptr);

    // After Stage 1 is complete, w should no longer have
    // an associated full frame.
    CILK_ASSERT(NULL == w->l->frame_ff);

    // Stage 2.  First do a quick check of our 1-element queue.
    full_frame *ff = pop_next_frame(w);

    if (!ff) {
        // Stage 3.  We didn't find anything from our 1-element
        // queue.  Now go through the steal loop to find work.
        ff = search_until_work_found_or_done(w);
        if (!ff) {
            CILK_ASSERT(w->g->work_done);
            return NULL;
        }
    }

    // Stage 4.  Now that we have found a full frame to work on,
    // actually execute it.
    __cilkrts_stack_frame *sf;

    // There shouldn't be any uncaught exceptions.
    //
    // In Windows, the OS catches any exceptions not caught by the
    // user code.  Thus, we are omitting the check on Windows.
    //
    // On Android, calling std::uncaught_exception with the stlport
    // library causes a seg fault.  Since we're not supporting
    // exceptions there at this point, just don't do the check
    CILKBUG_ASSERT_NO_UNCAUGHT_EXCEPTION();

    BEGIN_WITH_WORKER_LOCK(w) {
        CILK_ASSERT(!w->l->frame_ff);
        BEGIN_WITH_FRAME_LOCK(w, ff) {
            sf = ff->call_stack;
            CILK_ASSERT(sf && !sf->call_parent);
            setup_for_execution(w, ff, 0);
        } END_WITH_FRAME_LOCK(w, ff);
    } END_WITH_WORKER_LOCK(w);
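    // Lock-order note: the worker lock is taken before the frame lock,
    // here as in do_sync() and __cilkrts_return() below; keeping one
    // consistent acquisition order avoids lock-ordering deadlocks
    // between workers.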
    /* run it */
    //
    // Prepare to run the full frame.  To do so, we need to:
    //   (a) Execute some code on this fiber (the scheduling
    //       fiber) to set up data structures, and
    //   (b) Suspend the scheduling fiber, and resume the
    //       user-code fiber.

    // Part (a). Set up data structures.
    scheduling_fiber_prepare_to_resume_user_code(w, ff, sf);

    cilk_fiber *other = w->l->frame_ff->fiber_self;
    cilk_fiber_data* other_data = cilk_fiber_get_data(other);
    cilk_fiber_data* current_fiber_data = cilk_fiber_get_data(current_fiber);

    // I believe two cases are possible here, both of which
    // should have other_data->resume_sf as NULL.
    //
    // 1. Resuming a fiber that was previously executing
    //    user code (i.e., a provably-good-steal).
    //    In this case, resume_sf should have been
    //    set to NULL when it was suspended.
    //
    // 2. Resuming code on a steal.  In this case, since we
    //    grabbed a new fiber, resume_sf should be NULL.
    CILK_ASSERT(NULL == other_data->resume_sf);

#if FIBER_DEBUG >= 2
    fprintf(stderr, "W=%d: other fiber=%p, setting resume_sf to %p\n",
            w->self, other, other_data->resume_sf);
#endif
    // Update our own fiber's data.
    current_fiber_data->resume_sf = NULL;
    // The scheduling fiber should have the right owner from before.
    CILK_ASSERT(current_fiber_data->owner == w);
    other_data->resume_sf = sf;


#if FIBER_DEBUG >= 3
    fprintf(stderr, "ThreadId=%p (about to suspend self resume other), W=%d: current_fiber=%p, other=%p, current_fiber->resume_sf = %p, other->resume_sf = %p\n",
            cilkos_get_current_thread_id(),
            w->self,
            current_fiber, other,
            current_fiber_data->resume_sf,
            other_data->resume_sf);
#endif
    return other;
}


/**
 * This function is executed once by each worker, to initialize its
 * scheduling loop.
 */
static void worker_scheduler_init_function(__cilkrts_worker *w)
{
    // First, execute the startup tasks that must happen for all
    // worker types.
    ITT_SYNC_PREPARE(w);
    /* Notify tools about the new worker. Inspector needs this, but we
       don't want to confuse Cilkscreen with system threads.  User threads
       do this notification in bind_thread */
    if (! w->g->under_ptool)
        __cilkrts_cilkscreen_establish_worker(w);

    // Seed the initial random number generator.
    // If we forget to do this, then the worker always steals from 0.
    // Programs will still execute correctly, but
    // you may see a subtle performance bug...
    mysrand(w, (w->self + 1));

    // The startup work varies, depending on the worker type.
    switch (w->l->type) {
    case WORKER_USER:
        // Stop working once we've entered the scheduler.
        // For user workers, INTERVAL_IN_SCHEDULER counts the time
        // since we called bind_thread.
        break;

    case WORKER_SYSTEM:
        // If a system worker is starting, we must also be starting
        // the runtime.

        // Runtime begins in a wait-state and is woken up by the first user
        // worker when the runtime is ready.
        signal_node_wait(w->l->signal_node);
        // ...
        // Runtime is waking up.
        notify_children_run(w);
        w->l->steal_failure_count = 0;

        // For system threads, count all the time this thread is
        // alive in the scheduling loop.
        START_INTERVAL(w, INTERVAL_IN_SCHEDULER);
        START_INTERVAL(w, INTERVAL_WORKING);
        break;
    default:
        __cilkrts_bug("Unknown worker %p of type %d entering scheduling loop\n",
                      w, w->l->type);
    }
}
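/* Why (w->self + 1) as the seed: with a constant seed every worker would
 * start its victim sequence identically, and with seed 0 a worker would
 * always begin by probing worker 0 (the performance bug mentioned above).
 * Offsetting by the unique worker id de-correlates the victim sequences
 * cheaply; any per-worker-distinct seed would do. */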
/**
 * This function is executed once by each worker, to finish its
 * scheduling loop.
 *
 * @note Currently, only system workers finish their loops.  User
 * workers will jump away to user code without exiting their
 * scheduling loop.
 */
static void worker_scheduler_terminate_function(__cilkrts_worker *w)
{
    // A user worker should never finish by falling through the
    // scheduling loop.
    CILK_ASSERT(WORKER_USER != w->l->type);
    STOP_INTERVAL(w, INTERVAL_IN_RUNTIME);
    STOP_INTERVAL(w, INTERVAL_IN_SCHEDULER);
}

/**
 * The main scheduler function executed by a worker's scheduling
 * fiber.
 *
 * This method is started by either a new system worker, or a user
 * worker that has stalled and just been imported into the runtime.
 */
static void worker_scheduler_function(__cilkrts_worker *w)
{
    worker_scheduler_init_function(w);

    // The main scheduling loop body.

    while (!w->g->work_done) {
        // Set intervals.  Now we are in the runtime instead of working.
        START_INTERVAL(w, INTERVAL_IN_RUNTIME);
        STOP_INTERVAL(w, INTERVAL_WORKING);

        // Execute the "body" of the scheduling loop, and figure
        // out the fiber to jump to next.
        cilk_fiber* fiber_to_resume
            = worker_scheduling_loop_body(w->l->scheduling_fiber, w);

        if (fiber_to_resume) {
            // Suspend the current fiber and resume next one.
            NOTE_INTERVAL(w, INTERVAL_SUSPEND_RESUME_OTHER);
            STOP_INTERVAL(w, INTERVAL_IN_RUNTIME);
            START_INTERVAL(w, INTERVAL_WORKING);
            cilk_fiber_suspend_self_and_resume_other(w->l->scheduling_fiber,
                                                     fiber_to_resume);

            // Return here only when this (scheduling) fiber is
            // resumed (i.e., this worker wants to reenter the runtime).
        }
    }

    // Finish the scheduling loop.
    worker_scheduler_terminate_function(w);
}
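/* Shape of the loop above, in pseudocode (illustrative only):
 *
 *     init();
 *     while (!work_done) {
 *         user_fiber = loop_body(scheduling_fiber);   // find a frame
 *         if (user_fiber)
 *             switch_to(user_fiber);   // runs user code; control comes
 *                                      // back here on the next
 *                                      // longjmp into the runtime
 *     }
 *     terminate();
 *
 * The scheduling fiber is never destroyed inside the loop; user fibers
 * ping-pong control back to it via longjmp_into_runtime(). */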
/*************************************************************
  Forward declarations for reduction protocol.
*************************************************************/

static __cilkrts_worker*
execute_reductions_for_sync(__cilkrts_worker *w,
                            full_frame *ff,
                            __cilkrts_stack_frame *sf_at_sync);

static __cilkrts_worker*
execute_reductions_for_spawn_return(__cilkrts_worker *w,
                                    full_frame *ff,
                                    __cilkrts_stack_frame *returning_sf);



/*************************************************************
  Scheduler functions that are callable by client code
*************************************************************/
static full_frame *disown(__cilkrts_worker *w,
                          full_frame *ff,
                          __cilkrts_stack_frame *sf,
                          const char *why)
{
    CILK_ASSERT(ff);
    make_unrunnable(w, ff, sf, sf != 0, why);
    w->l->frame_ff = 0;
    return ff->parent;
}

/**
 * Called when ff is returning from a spawn, and we need to execute a
 * reduction.
 *
 * @param w             The currently executing worker.
 * @param ff            The full frame for w.
 * @param returning_sf  The stack frame for the spawn helper that is returning.
 *
 * Normally, by the time we gain control in the runtime, the worker
 * has already popped off the __cilkrts_stack_frame "returning_sf"
 * from its call chain.
 *
 * When we have only serial reductions, w->current_stack_frame is not
 * needed any more, because w is about to enter the runtime scheduling
 * loop anyway.  Similarly, the frame "ff" is slated to be destroyed
 * after the runtime finishes the return from spawn and splices ff out
 * of the tree of full frames.
 *
 * To execute a parallel reduction, however, we still want
 * w->current_stack_frame == returning_sf, and we are going to use the
 * frame ff for a little bit longer.
 *
 * This method:
 *
 * 1. Puts returning_sf back as w's current stack frame.
 * 2. Makes "ff" runnable again on w.
 */
static inline
void restore_frame_for_spawn_return_reduction(__cilkrts_worker *w,
                                              full_frame *ff,
                                              __cilkrts_stack_frame *returning_sf) {
#if REDPAR_DEBUG >= 2
    CILK_ASSERT(returning_sf);
    CILK_ASSERT(returning_sf->worker == w);
#endif
    // Change w's current stack frame back to "returning_sf".
    //
    // Intuitively, w->current_stack_frame should be
    // returning_sf->call_parent at this point.
    //
    // We can not assert this, however, because the pop of
    // returning_sf from the call chain has already cleared
    // returning_sf->call_parent.  We don't want to restore the call
    // parent of returning_sf, because its parent has been stolen, and
    // the runtime assumes that steals break this link.

    // We cannot assert call_parent is NULL either, since that's not true for
    // Win64 exception handling
//    CILK_ASSERT(returning_sf->call_parent == NULL);
    w->current_stack_frame = returning_sf;

    // Make the full frame "ff" runnable again, in preparation for
    // executing the reduction.
    make_runnable(w, ff);
}
NORETURN __cilkrts_c_sync(__cilkrts_worker *w,
                          __cilkrts_stack_frame *sf_at_sync)
{
    full_frame *ff;

    // Claim: This read of w->l->frame_ff can occur without
    // holding the worker lock because when w has reached a sync
    // and entered the runtime (because it stalls), w's deque is empty
    // and no one else can steal and change w->l->frame_ff.

    ff = w->l->frame_ff;
#ifdef _WIN32
    __cilkrts_save_exception_state(w, ff);
#else
    // Move any pending exceptions into the full frame
    CILK_ASSERT(NULL == ff->pending_exception);
    ff->pending_exception = w->l->pending_exception;
    w->l->pending_exception = NULL;
#endif

    w = execute_reductions_for_sync(w, ff, sf_at_sync);

#if FIBER_DEBUG >= 3
    fprintf(stderr, "ThreadId=%p, w->self = %d.  about to longjmp_into_runtime[c_sync] with ff=%p\n",
            cilkos_get_current_thread_id(), w->self, ff);
#endif

    longjmp_into_runtime(w, do_sync, sf_at_sync);
}

static void do_sync(__cilkrts_worker *w, full_frame *ff,
                    __cilkrts_stack_frame *sf)
{
    //int abandoned = 1;
    enum provably_good_steal_t steal_result = ABANDON_EXECUTION;

    START_INTERVAL(w, INTERVAL_SYNC_CHECK) {
        BEGIN_WITH_WORKER_LOCK_OPTIONAL(w) {

            CILK_ASSERT(ff);
            BEGIN_WITH_FRAME_LOCK(w, ff) {
                CILK_ASSERT(sf->call_parent == 0);
                CILK_ASSERT(sf->flags & CILK_FRAME_UNSYNCHED);

                // Before switching into the scheduling fiber, we should have
                // already taken care of deallocating the current
                // fiber.
                CILK_ASSERT(NULL == ff->fiber_self);

                // Update the frame's pedigree information if this is an ABI 1
                // or later frame
                if (CILK_FRAME_VERSION_VALUE(sf->flags) >= 1)
                {
                    sf->parent_pedigree.rank = w->pedigree.rank;
                    sf->parent_pedigree.parent = w->pedigree.parent;

                    // Note that the pedigree rank needs to be updated
                    // when setup_for_execution_pedigree runs
                    sf->flags |= CILK_FRAME_SF_PEDIGREE_UNSYNCHED;
                }

                /* the decjoin() occurs in provably_good_steal() */
                steal_result = provably_good_steal(w, ff);

            } END_WITH_FRAME_LOCK(w, ff);
            // set w->l->frame_ff = NULL after checking abandoned
            if (WAIT_FOR_CONTINUE != steal_result) {
                w->l->frame_ff = NULL;
            }
        } END_WITH_WORKER_LOCK_OPTIONAL(w);
    } STOP_INTERVAL(w, INTERVAL_SYNC_CHECK);

    // Now, if we are in a replay situation and provably_good_steal() returned
    // WAIT_FOR_CONTINUE, we should sleep, reacquire locks, call
    // provably_good_steal(), and release locks until we get a value other
    // than WAIT_FOR_CONTINUE from the function.
#ifdef CILK_RECORD_REPLAY
    // We don't have to explicitly check for REPLAY_LOG below because
    // steal_result can only be set to WAIT_FOR_CONTINUE during replay
    while(WAIT_FOR_CONTINUE == steal_result)
    {
        __cilkrts_sleep();
        BEGIN_WITH_WORKER_LOCK_OPTIONAL(w)
        {
            ff = w->l->frame_ff;
            BEGIN_WITH_FRAME_LOCK(w, ff)
            {
                steal_result = provably_good_steal(w, ff);
            } END_WITH_FRAME_LOCK(w, ff);
            if (WAIT_FOR_CONTINUE != steal_result)
                w->l->frame_ff = NULL;
        } END_WITH_WORKER_LOCK_OPTIONAL(w);
    }
#endif  // CILK_RECORD_REPLAY

#ifdef ENABLE_NOTIFY_ZC_INTRINSIC
    // If we can't make any further progress on this thread, tell Inspector
    // that we're abandoning the work and will go find something else to do.
    if (ABANDON_EXECUTION == steal_result)
    {
        NOTIFY_ZC_INTRINSIC("cilk_sync_abandon", 0);
    }
#endif // defined ENABLE_NOTIFY_ZC_INTRINSIC

    return; /* back to scheduler loop */
}
/* worker W completely promotes its own deque, simulating the case
   where the whole deque is stolen.  We use this mechanism to force
   the allocation of new storage for reducers for race-detection
   purposes. */
void __cilkrts_promote_own_deque(__cilkrts_worker *w)
{
    // Remember the fiber we start this method on.
    CILK_ASSERT(w->l->frame_ff);
    cilk_fiber* starting_fiber = w->l->frame_ff->fiber_self;

    BEGIN_WITH_WORKER_LOCK(w) {
        while (dekker_protocol(w)) {
            /* PLACEHOLDER_FIBER is used as non-null marker to tell detach()
               and make_child() that this frame should be treated as a spawn
               parent, even though we have not assigned it a stack. */
            detach_for_steal(w, w, PLACEHOLDER_FIBER);
        }
    } END_WITH_WORKER_LOCK(w);


    // TBD: The management of full frames and fibers is a bit
    // sketchy here.  We are promoting stack frames into full frames,
    // and pretending they are stolen away, but no other worker is
    // actually working on them.  Some runtime invariants
    // may be broken here.
    //
    // Technically, if we are simulating a steal from w,
    // w should get a new full frame, but
    // keep the same fiber.  A real thief would be taking the
    // loot frame away, getting a new fiber, and starting to execute the
    // loot frame.
    //
    // What should a fake thief do?  Where does the frame go?

    // In any case, we should be finishing the promotion process on
    // the same fiber we started with.
    CILK_ASSERT(w->l->frame_ff);
    CILK_ASSERT(w->l->frame_ff->fiber_self == starting_fiber);
}
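/* THE-protocol arithmetic used below, with a concrete example: the
 * compiled spawn-return path speculatively decrements w->tail before
 * calling into the runtime, so the parent was stolen exactly when
 * head >= tail + 1.  Suppose head == tail == ltq + 5 after the
 * speculative decrement: then !(5 < 5 + 1) is false -- nothing was
 * stolen and the fast path simply returns.  If a thief had advanced
 * head to ltq + 6, !(6 < 5 + 1) is true and we take the slow path,
 * which first re-increments tail to undo the speculation. */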
/* the client code calls this function after a spawn when the dekker
   protocol fails.  The function may either return or longjmp
   into the rts

   This function takes in a "returning_sf" argument which corresponds
   to the __cilkrts_stack_frame that we are finishing (i.e., the
   argument to __cilkrts_leave_frame).
 */
void __cilkrts_c_THE_exception_check(__cilkrts_worker *w,
                                     __cilkrts_stack_frame *returning_sf)
{
    full_frame *ff;
    int stolen_p;
    __cilkrts_stack_frame *saved_sf = NULL;

    START_INTERVAL(w, INTERVAL_THE_EXCEPTION_CHECK);

    BEGIN_WITH_WORKER_LOCK(w) {
        ff = w->l->frame_ff;
        CILK_ASSERT(ff);
        /* This code is called only upon a normal return and never
           upon an exceptional return.  Assert that this is the
           case. */
        CILK_ASSERT(!w->l->pending_exception);

        reset_THE_exception(w);
        stolen_p = !(w->head < (w->tail + 1)); /* +1 because tail was
                                                  speculatively
                                                  decremented by the
                                                  compiled code */

        if (stolen_p) {
            /* XXX This will be charged to THE for accounting purposes */
            __cilkrts_save_exception_state(w, ff);

            // Save the value of the current stack frame.
            saved_sf = w->current_stack_frame;

            // Reverse the decrement from undo_detach.
            // This update effectively resets the deque to be
            // empty (i.e., changes w->tail back to equal w->head).
            // We need to reset the deque to execute parallel
            // reductions.  When we have only serial reductions, it
            // does not matter, since serial reductions do not
            // change the deque.
            w->tail++;
#if REDPAR_DEBUG > 1
            // ASSERT our deque is empty.
            CILK_ASSERT(w->head == w->tail);
#endif
        }
    } END_WITH_WORKER_LOCK(w);

    STOP_INTERVAL(w, INTERVAL_THE_EXCEPTION_CHECK);

    if (stolen_p)
    {
        w = execute_reductions_for_spawn_return(w, ff, returning_sf);

        // "Mr. Policeman?  My parent always told me that if I was in trouble
        //  I should ask a nice policeman for help.  I can't find my parent
        //  anywhere..."
        //
        // Write a record to the replay log for an attempt to return to a stolen parent
        replay_record_orphaned(w);

        // Update the pedigree only after we've finished the
        // reductions.
        update_pedigree_on_leave_frame(w, returning_sf);

        // Notify Inspector that the parent has been stolen and we're
        // going to abandon this work and go do something else.  This
        // will match the cilk_leave_begin in the compiled code
        NOTIFY_ZC_INTRINSIC("cilk_leave_stolen", saved_sf);

        DBGPRINTF ("%d: longjmp_into_runtime from __cilkrts_c_THE_exception_check\n", w->self);
        longjmp_into_runtime(w, do_return_from_spawn, 0);
        DBGPRINTF ("%d: returned from longjmp_into_runtime from __cilkrts_c_THE_exception_check?!\n", w->self);
    }
    else
    {
        NOTE_INTERVAL(w, INTERVAL_THE_EXCEPTION_CHECK_USELESS);
        return;
    }
}

/* Return an exception to a stolen parent. */
NORETURN __cilkrts_exception_from_spawn(__cilkrts_worker *w,
                                        __cilkrts_stack_frame *returning_sf)
{
    full_frame *ff = w->l->frame_ff;
    // This is almost the same as THE_exception_check, except
    // the detach didn't happen, so we don't need to undo the tail
    // update.
    CILK_ASSERT(w->head == w->tail);
    w = execute_reductions_for_spawn_return(w, ff, returning_sf);

    longjmp_into_runtime(w, do_return_from_spawn, 0);
    CILK_ASSERT(0);
}

static void do_return_from_spawn(__cilkrts_worker *w,
                                 full_frame *ff,
                                 __cilkrts_stack_frame *sf)
{
    full_frame *parent_ff;
    enum provably_good_steal_t steal_result = ABANDON_EXECUTION;

    BEGIN_WITH_WORKER_LOCK_OPTIONAL(w) {
        CILK_ASSERT(ff);
        CILK_ASSERT(!ff->is_call_child);
        CILK_ASSERT(sf == NULL);
        parent_ff = ff->parent;

        BEGIN_WITH_FRAME_LOCK(w, ff) {
            decjoin(ff);
        } END_WITH_FRAME_LOCK(w, ff);

        BEGIN_WITH_FRAME_LOCK(w, parent_ff) {
            if (parent_ff->simulated_stolen)
                unconditional_steal(w, parent_ff);
            else
                steal_result = provably_good_steal(w, parent_ff);
        } END_WITH_FRAME_LOCK(w, parent_ff);

    } END_WITH_WORKER_LOCK_OPTIONAL(w);

    // Loop here in replay mode
#ifdef CILK_RECORD_REPLAY
    // We don't have to explicitly check for REPLAY_LOG below because
    // steal_result can only get set to WAIT_FOR_CONTINUE during replay.
    // We also don't have to worry about the simulated_stolen flag
    // because steal_result can only be set to WAIT_FOR_CONTINUE by
    // provably_good_steal().
    while(WAIT_FOR_CONTINUE == steal_result)
    {
        __cilkrts_sleep();
        BEGIN_WITH_WORKER_LOCK_OPTIONAL(w)
        {
            BEGIN_WITH_FRAME_LOCK(w, parent_ff)
            {
                steal_result = provably_good_steal(w, parent_ff);
            } END_WITH_FRAME_LOCK(w, parent_ff);
        } END_WITH_WORKER_LOCK_OPTIONAL(w);
    }
#endif // CILK_RECORD_REPLAY

    // Cleanup the child frame.
    __cilkrts_destroy_full_frame(w, ff);
    return;
}
#ifdef _WIN32
/* migrate an exception across fibers.  Call this function when an exception has
 * been thrown and has to traverse across a steal.  The exception has already
 * been wrapped up, so all that remains is to longjmp() into the continuation,
 * sync, and re-raise it.
 */
void __cilkrts_migrate_exception(__cilkrts_stack_frame *sf) {

    __cilkrts_worker *w = sf->worker;
    full_frame *ff;

    BEGIN_WITH_WORKER_LOCK(w) {
        ff = w->l->frame_ff;
        reset_THE_exception(w);
        /* there is no need to check for a steal because we wouldn't be here if
           there weren't a steal. */
        __cilkrts_save_exception_state(w, ff);

        CILK_ASSERT(w->head == w->tail);
    } END_WITH_WORKER_LOCK(w);

    {
        // TBD(jsukha): This function emulates the
        // "do_return_from_spawn" path.
        w = execute_reductions_for_spawn_return(w, ff, sf);
    }

    longjmp_into_runtime(w, do_return_from_spawn, 0); /* does not return. */
    CILK_ASSERT(! "Shouldn't be here...");
}
#endif


/* Pop a call stack from TAIL.  Return the call stack, or NULL if the
   queue is empty */
__cilkrts_stack_frame *__cilkrts_pop_tail(__cilkrts_worker *w)
{
    __cilkrts_stack_frame *sf;
    BEGIN_WITH_WORKER_LOCK(w) {
        __cilkrts_stack_frame *volatile *tail = w->tail;
        if (w->head < tail) {
            --tail;
            sf = *tail;
            w->tail = tail;
        } else {
            sf = 0;
        }
    } END_WITH_WORKER_LOCK(w);
    return sf;
}

#ifdef CILK_RECORD_REPLAY
__cilkrts_stack_frame *simulate_pop_tail(__cilkrts_worker *w)
{
    __cilkrts_stack_frame *sf;
    BEGIN_WITH_WORKER_LOCK(w) {
        if (w->head < w->tail) {
            sf = *(w->tail-1);
        } else {
            sf = 0;
        }
    } END_WITH_WORKER_LOCK(w);
    return sf;
}
#endif
/* Return from a call, not a spawn. */
void __cilkrts_return(__cilkrts_worker *w)
{
    full_frame *ff, *parent_ff;
    START_INTERVAL(w, INTERVAL_RETURNING);

    BEGIN_WITH_WORKER_LOCK_OPTIONAL(w) {
        ff = w->l->frame_ff;
        CILK_ASSERT(ff);
        CILK_ASSERT(ff->join_counter == 1);
        /* This path is not used to return from spawn. */
        CILK_ASSERT(ff->is_call_child);

        BEGIN_WITH_FRAME_LOCK(w, ff) {
            // After this call, w->l->frame_ff != ff.
            // Technically, w will "own" ff until ff is freed,
            // however, because ff is a dying leaf full frame.
            parent_ff = disown(w, ff, 0, "return");
            decjoin(ff);

#ifdef _WIN32
            __cilkrts_save_exception_state(w, ff);
#else
            // Move the pending exceptions into the full frame
            // This should always be NULL if this isn't a
            // return with an exception
            CILK_ASSERT(NULL == ff->pending_exception);
            ff->pending_exception = w->l->pending_exception;
            w->l->pending_exception = NULL;
#endif  // _WIN32

        } END_WITH_FRAME_LOCK(w, ff);

        __cilkrts_fence(); /* redundant */

        CILK_ASSERT(parent_ff);

        BEGIN_WITH_FRAME_LOCK(w, parent_ff) {
            finalize_child_for_call(w, parent_ff, ff);
        } END_WITH_FRAME_LOCK(w, parent_ff);

        ff = pop_next_frame(w);
        /* ff will be non-null except when the parent frame is owned
           by another worker.
           CILK_ASSERT(ff)
        */
        CILK_ASSERT(!w->l->frame_ff);
        if (ff) {
            BEGIN_WITH_FRAME_LOCK(w, ff) {
                __cilkrts_stack_frame *sf = ff->call_stack;
                CILK_ASSERT(sf && !sf->call_parent);
                setup_for_execution(w, ff, 1);
            } END_WITH_FRAME_LOCK(w, ff);
        }
    } END_WITH_WORKER_LOCK_OPTIONAL(w);

    STOP_INTERVAL(w, INTERVAL_RETURNING);
}

static void __cilkrts_unbind_thread()
{
    int stop_cilkscreen = 0;
    global_state_t *g;

    // Take out the global OS mutex to protect accesses to the table of workers
    global_os_mutex_lock();

    if (cilkg_is_published()) {
        __cilkrts_worker *w = __cilkrts_get_tls_worker();
        if (w) {
            g = w->g;

            // If there's only 1 worker, the counts will be stopped in
            // __cilkrts_scheduler
            if (g->P > 1)
            {
                STOP_INTERVAL(w, INTERVAL_WORKING);
                STOP_INTERVAL(w, INTERVAL_IN_SCHEDULER);
            }

            __cilkrts_set_tls_worker(0);

            if (w->self == -1) {
                // This worker is an overflow worker.  I.e., it was created on-
                // demand when the global pool ran out of workers.
                destroy_worker(w);
                __cilkrts_free(w);
            } else {
                // This is a normal user worker and needs to be counted by the
                // global state for the purposes of throttling system workers.
                w->l->type = WORKER_FREE;
                __cilkrts_leave_cilk(g);
            }

            stop_cilkscreen = (0 == g->Q);
        }
    }
    global_os_mutex_unlock();

    /* Turn off Cilkscreen.  This needs to be done when we are NOT holding the
     * os mutex. */
    if (stop_cilkscreen)
        __cilkrts_cilkscreen_disable_instrumentation();
}
/* special return from the initial frame */

void __cilkrts_c_return_from_initial(__cilkrts_worker *w)
{
    struct cilkred_map *rm;

    /* This is only called on a user thread worker. */
    CILK_ASSERT(w->l->type == WORKER_USER);

#if REDPAR_DEBUG >= 3
    fprintf(stderr, "[W=%d, desc=cilkrts_c_return_from_initial, ff=%p]\n",
            w->self, w->l->frame_ff);
#endif

    BEGIN_WITH_WORKER_LOCK_OPTIONAL(w) {
        full_frame *ff = w->l->frame_ff;
        CILK_ASSERT(ff);
        CILK_ASSERT(ff->join_counter == 1);
        w->l->frame_ff = 0;

        CILK_ASSERT(ff->fiber_self);
        // Save any TBB interop data for the next time this thread enters Cilk
        cilk_fiber_tbb_interop_save_info_from_stack(ff->fiber_self);

        // Deallocate the cilk_fiber that mapped to the user stack.  The
        // stack itself does not get deallocated (of course), but our data
        // structure becomes divorced from it.

#if FIBER_DEBUG >= 1
        fprintf(stderr, "ThreadId=%p: w=%d: We are about to deallocate ff->fiber_self = %p here. w->l->scheduling_fiber = %p. w->l->type = %d\n",
                cilkos_get_current_thread_id(),
                w->self,
                ff->fiber_self,
                w->l->scheduling_fiber,
                w->l->type);
#endif
        // The fiber in ff is a user-code fiber.  The fiber in
        // w->l->scheduling_fiber is a scheduling fiber.  These fibers should
        // never be equal.  When a user worker returns (and will unbind), we
        // should destroy only the fiber in ff.  The scheduling fiber will be
        // re-used.

        CILK_ASSERT(ff->fiber_self != w->l->scheduling_fiber);

        START_INTERVAL(w, INTERVAL_FIBER_DEALLOCATE) {
            // This fiber might not be deallocated here if there
            // is a pending exception on Windows that refers
            // to this fiber.
            //
            // First "suspend" the fiber, and then try to delete it.
            cilk_fiber_deallocate_from_thread(ff->fiber_self);
        } STOP_INTERVAL(w, INTERVAL_FIBER_DEALLOCATE);
        ff->fiber_self = NULL;

        /* Stash the reducer map; it is destroyed below, after the worker
           lock has been released. */
        rm = w->reducer_map;
        w->reducer_map = NULL;

#if REDPAR_DEBUG >= 3
        fprintf(stderr, "W=%d, reducer_map_to_delete=%p, was in ff=%p\n",
                w->self,
                rm,
                ff);
#endif
        __cilkrts_destroy_full_frame(w, ff);

        /* Work is never done. w->g->work_done = 1; __cilkrts_fence(); */
    } END_WITH_WORKER_LOCK_OPTIONAL(w);

    save_pedigree_leaf_from_user_worker(w);

    // Workers can have NULL reducer maps now.
    if (rm) {
        __cilkrts_destroy_reducer_map(w, rm);
    }

#if FIBER_DEBUG >= 1
    __cilkrts_worker* tmp = w;
    int tmp_id = w->self;
    fprintf(stderr, "w=%d: We are about to unbind thread (w=%p)\n",
            w->self,
            w);
#endif

    w = NULL;

    __cilkrts_unbind_thread();

#if FIBER_DEBUG >= 1
    fprintf(stderr, "w=%p, %d: Finished unbind\n",
            tmp, tmp_id);
#endif

    /* Other workers will stop trying to steal if this was the last worker. */

    return;
}


/*
 * __cilkrts_restore_stealing
 *
 * Restore the protected_tail to a previous state, possibly allowing frames
 * to be stolen.  The dekker_protocol has been extended to steal only if
 * head+1 is < protected_tail.
 */

void __cilkrts_restore_stealing(
    __cilkrts_worker *w,
    __cilkrts_stack_frame *volatile *saved_protected_tail)
{
    /* On most x86 this pair of operations would be slightly faster
       as an atomic exchange due to the implicit memory barrier in
       an atomic instruction. */
    w->protected_tail = saved_protected_tail;
    __cilkrts_fence();
}
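/* Illustrative sketch (not a call site in this file): runtime code that
 * must briefly keep thieves away from this worker's deque uses the
 * function above and the function below as a bracketed pair:
 *
 *     __cilkrts_stack_frame *volatile *saved =
 *         __cilkrts_disallow_stealing(w, NULL);   // freeze the whole deque
 *     ... work that must not race with a thief ...
 *     __cilkrts_restore_stealing(w, saved);       // re-enable stealing
 */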
/*
 * __cilkrts_disallow_stealing
 *
 * Move the protected_tail to NEW_PROTECTED_TAIL, preventing any
 * frames from being stolen.  If NEW_PROTECTED_TAIL is NULL, prevent
 * stealing from the whole queue.  The dekker_protocol has been
 * extended to steal only if head+1 is also < protected_tail.
 */

__cilkrts_stack_frame *volatile *__cilkrts_disallow_stealing(
    __cilkrts_worker *w,
    __cilkrts_stack_frame *volatile *new_protected_tail)
{
    __cilkrts_stack_frame *volatile *saved_protected_tail = w->protected_tail;

    if (!new_protected_tail)
        new_protected_tail = w->l->ltq;

    if (w->protected_tail > new_protected_tail) {
        w->protected_tail = new_protected_tail;
        /* Issue a store-store barrier.  The update to protected_tail
           here must precede the update to tail in the next spawn.
           On x86 this is probably not needed. */
#if defined __GNUC__ && __ICC >= 1200 && !(__MIC__ || __MIC2__)
        _mm_sfence();
#else
        __cilkrts_fence();
#endif
    }

    return saved_protected_tail;
}

/*************************************************************
  Initialization and startup
*************************************************************/

__cilkrts_worker *make_worker(global_state_t *g,
                              int self, __cilkrts_worker *w)
{
    w->self = self;
    w->g = g;

    w->pedigree.rank = 0;    // Initial rank is 0
    w->pedigree.parent = NULL;

    w->l = (local_state *)__cilkrts_malloc(sizeof(*w->l));

    __cilkrts_frame_malloc_per_worker_init(w);

    w->reducer_map = NULL;
    w->current_stack_frame = NULL;
    w->reserved = NULL;

    w->l->worker_magic_0 = WORKER_MAGIC_0;
    w->l->team = NULL;
    w->l->type = WORKER_FREE;

    __cilkrts_mutex_init(&w->l->lock);
    __cilkrts_mutex_init(&w->l->steal_lock);
    w->l->do_not_steal = 0;
    w->l->frame_ff = 0;
    w->l->next_frame_ff = 0;
    w->l->last_full_frame = NULL;

    w->l->ltq = (__cilkrts_stack_frame **)
        __cilkrts_malloc(g->ltqsize * sizeof(*w->l->ltq));
    w->ltq_limit = w->l->ltq + g->ltqsize;
    w->head = w->tail = w->l->ltq;

    cilk_fiber_pool_init(&w->l->fiber_pool,
                         &g->fiber_pool,
                         g->stack_size,
                         g->fiber_pool_size,
                         0,   // alloc_max is 0.  We don't allocate from the heap
                              // directly without checking the parent pool.
                         0);
#if FIBER_DEBUG >= 2
    fprintf(stderr, "ThreadId=%p: Making w=%d (%p), pool = %p\n",
            cilkos_get_current_thread_id(),
            w->self, w,
            &w->l->fiber_pool);
#endif
    w->l->scheduling_fiber = NULL;
    w->l->original_pedigree_leaf = NULL;
    w->l->rand_seed = 0;   /* the scheduler will overwrite this field */

    w->l->post_suspend = 0;
    w->l->suspended_stack = 0;
    w->l->fiber_to_free = NULL;
    w->l->pending_exception = NULL;

#if CILK_PROFILE
    w->l->stats = __cilkrts_malloc(sizeof(statistics));
    __cilkrts_init_stats(w->l->stats);
#else
    w->l->stats = NULL;
#endif
    w->l->steal_failure_count = 0;

    // Nothing's been stolen yet
    w->l->work_stolen = 0;

    // Initialize record/replay assuming we're doing neither
    w->l->record_replay_fptr = NULL;
    w->l->replay_list_root = NULL;
    w->l->replay_list_entry = NULL;

    w->l->signal_node = NULL;
    w->l->worker_magic_1 = WORKER_MAGIC_1;

    /*w->parallelism_disabled = 0;*/

    // Allow stealing all frames.  Sets w->protected_tail.
    __cilkrts_restore_stealing(w, w->ltq_limit);

    __cilkrts_init_worker_sysdep(w);

    reset_THE_exception(w);

    return w;
}
void destroy_worker(__cilkrts_worker *w)
{
    CILK_ASSERT(NULL == w->l->pending_exception);

    // Deallocate the scheduling fiber
    if (NULL != w->l->scheduling_fiber)
    {
        // The scheduling fiber is the main fiber for system workers and must
        // be deallocated by the thread that created it.  Thus, we can
        // deallocate only free workers' (formerly user workers) scheduling
        // fibers here.
        CILK_ASSERT(WORKER_FREE == w->l->type);

#if FIBER_DEBUG >= 1
        fprintf(stderr, "ThreadId=%p, w=%p, %d, deallocating scheduling fiber = %p\n",
                cilkos_get_current_thread_id(),
                w,
                w->self,
                w->l->scheduling_fiber);
#endif
        int ref_count = cilk_fiber_remove_reference(w->l->scheduling_fiber, NULL);
        // The scheduling fiber should never have extra references because of
        // exceptions.
        CILK_ASSERT(0 == ref_count);
        w->l->scheduling_fiber = NULL;
    }

#if CILK_PROFILE
    if (w->l->stats) {
        __cilkrts_free(w->l->stats);
    }
#else
    CILK_ASSERT(NULL == w->l->stats);
#endif

    /* Free any cached fibers. */
    cilk_fiber_pool_destroy(&w->l->fiber_pool);

    __cilkrts_destroy_worker_sysdep(w);

    if (w->l->signal_node) {
        CILK_ASSERT(WORKER_SYSTEM == w->l->type);
        signal_node_destroy(w->l->signal_node);
    }

    __cilkrts_free(w->l->ltq);
    __cilkrts_mutex_destroy(0, &w->l->lock);
    __cilkrts_mutex_destroy(0, &w->l->steal_lock);
    __cilkrts_frame_malloc_per_worker_cleanup(w);

    __cilkrts_free(w->l);

    // The caller is responsible for freeing the worker memory
}

/*
 * Make a worker into a system worker.
 */
static void make_worker_system(__cilkrts_worker *w) {
    CILK_ASSERT(WORKER_FREE == w->l->type);
    w->l->type = WORKER_SYSTEM;
    w->l->signal_node = signal_node_create();
}

void __cilkrts_deinit_internal(global_state_t *g)
{
    int i;
    __cilkrts_worker *w;

    // If there's no global state then we're done
    if (NULL == g)
        return;

#ifdef CILK_PROFILE
    __cilkrts_dump_stats_to_stderr(g);
#endif

    w = g->workers[0];
    if (w->l->frame_ff) {
        __cilkrts_destroy_full_frame(w, w->l->frame_ff);
        w->l->frame_ff = 0;
    }

    // Release any resources used for record/replay
    replay_term(g);

    // Destroy any system dependent global state
    __cilkrts_destroy_global_sysdep(g);

    for (i = 0; i < g->total_workers; ++i)
        destroy_worker(g->workers[i]);

    // Free the memory for all the worker blocks, which were allocated
    // contiguously
    __cilkrts_free(g->workers[0]);

    __cilkrts_free(g->workers);

    cilk_fiber_pool_destroy(&g->fiber_pool);
    __cilkrts_frame_malloc_global_cleanup(g);

    cilkg_deinit_global_state();
}

/*
 * Wake the runtime by notifying the system workers that they can steal.  The
 * first user worker into the runtime should call this.
 */
static void wake_runtime(global_state_t *g)
{
    __cilkrts_worker *root;
    if (g->P > 1) {
        // Send a message to the root node.  The message will propagate.
        root = g->workers[0];
        CILK_ASSERT(root->l->signal_node);
        signal_node_msg(root->l->signal_node, 1);
    }
}

/*
 * Put the runtime to sleep.  The last user worker out of the runtime should
 * call this.  Like Dad always said, turn out the lights when nobody's in the
 * room.
 */
static void sleep_runtime(global_state_t *g)
{
    __cilkrts_worker *root;
    if (g->P > 1) {
        // Send a message to the root node.  The message will propagate.
        root = g->workers[0];
        CILK_ASSERT(root->l->signal_node);
        signal_node_msg(root->l->signal_node, 0);
    }
}
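/* g->Q counts the user threads currently inside Cilk.  The 0 -> 1
   transition in __cilkrts_enter_cilk() below wakes the system workers,
   and the matching 1 -> 0 transition in __cilkrts_leave_cilk() puts
   them back to sleep. */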
/* Called when a user thread joins Cilk.
   Global lock must be held. */
void __cilkrts_enter_cilk(global_state_t *g)
{
    if (g->Q++ == 0) {
        // If this is the first user thread to enter Cilk, wake
        // up all the workers.
        wake_runtime(g);
    }
}

/* Called when a user thread leaves Cilk.
   Global lock must be held. */
void __cilkrts_leave_cilk(global_state_t *g)
{
    if (--g->Q == 0) {
        // Put the runtime to sleep.
        sleep_runtime(g);
    }
}

/*
 * worker_runnable
 *
 * Return SCHEDULE_RUN if the worker should continue to try to steal,
 * SCHEDULE_WAIT if it should wait, and SCHEDULE_EXIT if it should exit.
 */

NOINLINE
static enum schedule_t worker_runnable(__cilkrts_worker *w)
{
    global_state_t *g = w->g;

    /* If this worker has something to do, do it.
       Otherwise the work would be lost. */
    if (w->l->next_frame_ff)
        return SCHEDULE_RUN;

    // If Cilk has explicitly (by the user) been told to exit (i.e., by
    // __cilkrts_end_cilk() -> __cilkrts_stop_workers(g)), then exit.
    if (g->work_done)
        return SCHEDULE_EXIT;

    if (0 == w->self) {
        // This worker is the root node and is the only one that may query the
        // global state to see if there are still any user workers in Cilk.
        if (w->l->steal_failure_count > g->max_steal_failures) {
            if (signal_node_should_wait(w->l->signal_node)) {
                return SCHEDULE_WAIT;
            } else {
                // Reset the steal_failure_count since we have verified that
                // user workers are still in Cilk.
                w->l->steal_failure_count = 0;
            }
        }
    } else if (WORKER_SYSTEM == w->l->type &&
               signal_node_should_wait(w->l->signal_node)) {
        // This worker has been notified by its parent that it should stop
        // trying to steal.
        return SCHEDULE_WAIT;
    }

    return SCHEDULE_RUN;
}


// Initialize the worker structs, but don't start the workers themselves.
static void init_workers(global_state_t *g)
{
    int total_workers = g->total_workers;
    int i;
    struct CILK_ALIGNAS(256) buffered_worker {
        __cilkrts_worker w;
        char buf[64];
    } *workers_memory;

    /* not needed if only one worker */
    cilk_fiber_pool_init(&g->fiber_pool,
                         NULL,
                         g->stack_size,
                         g->global_fiber_pool_size,  // buffer_size
                         g->max_stacks,              // maximum # to allocate
                         1);

    cilk_fiber_pool_set_fiber_limit(&g->fiber_pool,
                                    (g->max_stacks ? g->max_stacks : INT_MAX));

    g->workers = (__cilkrts_worker **)
        __cilkrts_malloc(total_workers * sizeof(*g->workers));

    // Allocate 1 block of memory for the workers to make life easier for
    // tools like Inspector which run multithreaded and need to know the
    // memory range for all the workers that will be accessed in a user's
    // program
    workers_memory = (struct buffered_worker*)
        __cilkrts_malloc(sizeof(*workers_memory) * total_workers);

    // Notify any tools that care (Cilkscreen and Inspector) that they should
    // ignore memory allocated for the workers
    __cilkrts_cilkscreen_ignore_block(&workers_memory[0],
                                      &workers_memory[total_workers]);

    // Initialize worker structs, including unused worker slots.
    for (i = 0; i < total_workers; ++i) {
        g->workers[i] = make_worker(g, i, &workers_memory[i].w);
    }

    // Set the workers in the first P - 1 slots to be system workers.
    // The remaining worker structs already have type == 0 (WORKER_FREE).
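    // I.e., workers[0 .. g->system_workers-1] will run the scheduling loop,
    // while the remaining slots stay WORKER_FREE until a user thread binds
    // to one of them.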
    for (i = 0; i < g->system_workers; ++i) {
        make_worker_system(g->workers[i]);
    }
}

void __cilkrts_init_internal(int start)
{
    global_state_t *g = NULL;

    if (cilkg_is_published()) {
        g = cilkg_init_global_state();
    }
    else {
        // We think the state has not been published yet.
        // Grab the lock and try to initialize/publish.
        global_os_mutex_lock();

        if (cilkg_is_published()) {
            // Some other thread must have snuck in and published.
            g = cilkg_init_global_state();
        }
        else {
            // Initialize and retrieve the global state
            g = cilkg_init_global_state();

            // Set the scheduler pointer
            g->scheduler = worker_scheduler_function;

            // If we're running under a sequential P-Tool (Cilkscreen or
            // Cilkview) then there's only one worker and we need to tell
            // the tool about the extent of the stack
            if (g->under_ptool)
                __cilkrts_establish_c_stack();
            init_workers(g);

            // Initialize per-worker record/replay logging
            replay_init_workers(g);

            // Initialize any system dependent global state
            __cilkrts_init_global_sysdep(g);

            cilkg_publish_global_state(g);
        }

        global_os_mutex_unlock();
    }

    CILK_ASSERT(g);

    if (start && !g->workers_running)
    {
        // Acquire the global OS mutex while we're starting the workers
        global_os_mutex_lock();
        if (!g->workers_running)
            // Start P - 1 system workers since P includes the first user
            // worker.
            __cilkrts_start_workers(g, g->P - 1);
        global_os_mutex_unlock();
    }
}


/************************************************************************
  Methods for the reducer protocol.

  Reductions occur in two places:
    A. A full frame "ff" is returning from a spawn with a stolen parent.
    B. A full frame "ff" is stalling at a sync.

  To support parallel reductions, reduction functions need to be
  executed while control is on a user stack, before jumping into the
  runtime.  These reductions cannot occur while holding a worker or
  frame lock.

  Before a worker w executes a reduction in either Case A or B, w's
  deque is empty.

  Since parallel reductions push work onto the deque, we must do extra
  work to set up the runtime data structures properly before reductions
  begin, to allow stealing.  (Normally, when we have only serial
  reductions, once a worker w starts a reduction, its deque remains
  empty until w either steals another frame or resumes a suspended
  frame.  Thus, we don't care about the state of the deque, since w
  will reset its deque when setting up execution of a frame.)

  To allow for parallel reductions, we coerce the runtime data
  structures so that, from their perspective, it looks as though we
  have spliced in an "execute_reductions()" function.  Consider the
  two cases for reductions:

    Case A: Return from a spawn with a stolen parent.
      Consider a spawned function g that is returning on a worker w.
      Assume:
         - g was spawned from a parent function f.
         - ff is the full frame for g's spawn helper.
         - sf is the __cilkrts_stack_frame for g's spawn helper.

      We are conceptually splicing "execute_reductions()" so that it
      occurs immediately before the spawn helper of g returns to f.
      We do so by creating two different world views --- one for the
      runtime data structures, and one for the actual control flow.

        - Before reductions begin, the runtime data structures should
          look as though the spawn helper of g is calling
          "execute_reductions()", in terms of both the user stack and
          the worker deque.  More precisely, w should satisfy the
          following properties:

              (a) w has ff as its full frame,
              (b) w has sf as its __cilkrts_stack_frame, and
              (c) w has an empty deque.

          If the runtime satisfies these properties, then if w
          encounters a spawn in a parallel reduction, it can push onto
          a valid deque.  Also, a steal from w will build the correct
          tree of full frames.

        - In actual control flow, however, once the
          "execute_reductions()" function returns, it is actually
          returning to runtime code instead of g's spawn helper.

          At the point a worker w began executing reductions, the
          control flow / compiled code had already finished g's spawn
          helper, and w was about to enter the runtime.  With parallel
          reductions, some worker v (which might be different from w)
          is the one returning to the runtime.


      The reduction logic consists of 4 steps:

        A1. Restore the runtime data structures to make it look as
            though the spawn helper of g() is still the currently
            executing frame for w.

        A2. Execute reductions on the user stack.  Reductions also
            include the logic for exceptions and stacks.  Note that
            reductions start on w, but may finish on a different
            worker if there is parallelism in the reduce.

        A3. Splice out ff from the tree of full frames.

        A4. Jump into the runtime/scheduling stack and execute
            "do_return_from_spawn".  This method

            (a) frees the user stack we were just on if it is no longer
                needed,
            (b) decrements the join counter on ff->parent and tries to do
                a provably good steal, and
            (c) cleans up the full frame ff.


    Case B: Stalling at a sync.

      Consider a function g(), with full frame ff and
      __cilkrts_stack_frame sf.  Suppose g() stalls at a sync, and we
      are executing reductions.

      Conceptually, we are splicing an "execute_reductions()"
      function into g() as the last action that g() takes immediately
      before it executes the cilk_sync.

      The reduction logic for this case is similar to Case A.

        B1. Restore the runtime data structures.

            The main difference from Case A is that ff/sf is still a
            frame that needs to be executed later (since it is stalling
            at a cilk_sync).  Thus, we also need to save the current
            stack information into "ff" so that we can correctly resume
            execution of "ff" after the sync.

        B2. Execute reductions on the user stack.

        B3. No frame to splice out of the tree.

        B4. Jump into the runtime/scheduling stack and execute "do_sync".
            This method:

            (a) frees the user stack we were just on if it is no longer
                needed, and
            (b) tries to execute a provably good steal.

  Finally, for the reducer protocol, we consider two reduction paths,
  namely a "fast" and a "slow" path.  On the fast path, only trivial
  merges of reducer maps happen (i.e., one or both of the maps are
  NULL).  Otherwise, on the slow path, a reduction actually needs to
  happen.

*****************************************************************/
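/* For orientation, the user-level pattern that exercises this protocol is
 * simply the following (an illustrative sketch of client code, not part of
 * the runtime; see cilk/reducer_opadd.h for the reducer macros):
 *
 *     CILK_C_REDUCER_OPADD(sum, long, 0);
 *     CILK_C_REGISTER_REDUCER(sum);
 *     cilk_spawn f(&REDUCER_VIEW(sum));  // Case A runs when f returns,
 *     g(&REDUCER_VIEW(sum));             //   if the parent was stolen.
 *     cilk_sync;                         // Case B runs if we stall here.
 *     CILK_C_UNREGISTER_REDUCER(sum);
 */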
/**
 * @brief Locations to store the result of a reduction.
 *
 * Struct storing pointers to the fields in our "left" sibling that we
 * should update when splicing out a full frame or stalling at a sync.
 */
typedef struct {
    /** A pointer to the location of our left reducer map. */
    struct cilkred_map **map_ptr;

    /** A pointer to the location of our left exception. */
    struct pending_exception_info **exception_ptr;
} splice_left_ptrs;

/**
 * For a full frame returning from a spawn, calculate the pointers to
 * the maps and exceptions to my left.
 *
 * @param w  The currently executing worker.
 * @param ff Full frame that is dying.
 * @return   Pointers to our "left" for reducers and exceptions.
 */
static inline
splice_left_ptrs compute_left_ptrs_for_spawn_return(__cilkrts_worker *w,
                                                    full_frame *ff)
{
    // ASSERT: we hold the lock on ff->parent

    splice_left_ptrs left_ptrs;
    if (ff->left_sibling) {
        left_ptrs.map_ptr = &ff->left_sibling->right_reducer_map;
        left_ptrs.exception_ptr = &ff->left_sibling->right_pending_exception;
    }
    else {
        full_frame *parent_ff = ff->parent;
        left_ptrs.map_ptr = &parent_ff->children_reducer_map;
        left_ptrs.exception_ptr = &parent_ff->child_pending_exception;
    }
    return left_ptrs;
}

/**
 * For a full frame at a sync, calculate the pointers to the maps and
 * exceptions to my left.
 *
 * @param w  The currently executing worker.
 * @param ff Full frame that is stalling at a sync.
 * @return   Pointers to our "left" for reducers and exceptions.
 */
static inline
splice_left_ptrs compute_left_ptrs_for_sync(__cilkrts_worker *w,
                                            full_frame *ff)
{
    // ASSERT: we hold the lock on ff
    splice_left_ptrs left_ptrs;

    // Figure out which map to the left we should merge into.
    if (ff->rightmost_child) {
        CILK_ASSERT(ff->rightmost_child->parent == ff);
        left_ptrs.map_ptr = &(ff->rightmost_child->right_reducer_map);
        left_ptrs.exception_ptr = &(ff->rightmost_child->right_pending_exception);
    }
    else {
        // We have no children.  Then, we should be the last
        // worker at the sync... "left" is our child map.
        left_ptrs.map_ptr = &(ff->children_reducer_map);
        left_ptrs.exception_ptr = &(ff->child_pending_exception);
    }
    return left_ptrs;
}
/**
 * After we have completed all reductions on a spawn return, call this
 * method to finish up before jumping into the runtime.
 *
 *   1. Perform the "reduction" on stacks, i.e., execute the left
 *      holder logic to pass the leftmost stack up.
 *
 *      w->l->fiber_to_free holds any stack that needs to be freed
 *      when control switches into the runtime fiber.
 *
 *   2. Unlink and remove child_ff from the tree of full frames.
 *
 * @param w         The currently executing worker.
 * @param parent_ff The parent of child_ff.
 * @param child_ff  The full frame returning from a spawn.
 */
static inline
void finish_spawn_return_on_user_stack(__cilkrts_worker *w,
                                       full_frame *parent_ff,
                                       full_frame *child_ff)
{
    CILK_ASSERT(w->l->fiber_to_free == NULL);

    // Execute left-holder logic for stacks.
    if (child_ff->left_sibling || parent_ff->fiber_child) {
        // Case where we are not the leftmost stack.
        CILK_ASSERT(parent_ff->fiber_child != child_ff->fiber_self);

        // Remember any fiber we need to free in the worker.
        // After we jump into the runtime, we will actually do the
        // free.
        w->l->fiber_to_free = child_ff->fiber_self;
    }
    else {
        // We are leftmost; pass the stack/fiber up to the parent.
        // Thus, there is no stack/fiber to free.
        parent_ff->fiber_child = child_ff->fiber_self;
        w->l->fiber_to_free = NULL;
    }

    child_ff->fiber_self = NULL;

    unlink_child(parent_ff, child_ff);
}


/**
 * Executes any fast reductions necessary to splice ff out of the tree
 * of full frames.
 *
 * This "fast" path performs only trivial merges of reducer maps,
 * i.e., when one of them is NULL.
 * (See slow_path_reductions_for_spawn_return() for the slow path.)
 *
 * This method assumes w holds the frame lock on parent_ff.
 * After this method completes:
 *    1. We have spliced ff out of the tree of full frames.
 *    2. The reducer maps of child_ff have been deposited
 *       "left" according to the reducer protocol.
 *    3. w->l->fiber_to_free stores the fiber
 *       that needs to be freed once we jump into the runtime.
 *
 * We have not, however, decremented the join counter on ff->parent.
 * This prevents any other workers from resuming execution of the parent.
 *
 * @param w  The currently executing worker.
 * @param ff The full frame returning from a spawn.
 * @return NULL if we finished all reductions.
 * @return The address where the left map is stored (which should be passed to
 *         slow_path_reductions_for_spawn_return()) if there are
 *         still reductions to execute.
 */
struct cilkred_map**
fast_path_reductions_for_spawn_return(__cilkrts_worker *w,
                                      full_frame *ff)
{
    // ASSERT: we hold ff->parent->lock.
    splice_left_ptrs left_ptrs;

    CILK_ASSERT(NULL == w->l->pending_exception);

    // Figure out the pointers to the left where I want
    // to put reducers and exceptions.
    left_ptrs = compute_left_ptrs_for_spawn_return(w, ff);

    // Go ahead and merge exceptions while holding the lock.
    splice_exceptions_for_spawn(w, ff, left_ptrs.exception_ptr);

    // Now check if we have any reductions to perform.
    //
    // Consider all the cases of left, middle and right maps.
    //  0. (-, -, -) : finish and return NULL
    //  1. (L, -, -) : finish and return NULL
    //  2. (-, M, -) : slide over to left, finish, and return NULL
    //  3. (L, M, -) : return the left pointer (slow path)
    //  4. (-, -, R) : slide over to left, finish, and return NULL
    //  5. (L, -, R) : return the left pointer (slow path)
    //  6. (-, M, R) : return the left pointer (slow path)
    //  7. (L, M, R) : return the left pointer (slow path)
    //
    // In terms of code:
    //   L == *left_ptrs.map_ptr
    //   M == w->reducer_map
    //   R == ff->right_reducer_map
    //
    // The goal of the code below is to execute the fast path with
    // as few branches and writes as possible.

    int case_value = (*(left_ptrs.map_ptr) != NULL);
    case_value += ((w->reducer_map != NULL) << 1);
    case_value += ((ff->right_reducer_map != NULL) << 2);

    // Fastest path is case_value == 0 or 1.
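    // Worked example: (L, -, R) gives case_value == 1 + (0 << 1) + (1 << 2)
    // == 5, i.e., two maps are non-NULL, so the switch below falls into the
    // default case and we must take the slow path.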
    if (case_value >= 2) {
        switch (case_value) {
        case 2:
            *(left_ptrs.map_ptr) = w->reducer_map;
            w->reducer_map = NULL;
            return NULL;
        case 4:
            *(left_ptrs.map_ptr) = ff->right_reducer_map;
            ff->right_reducer_map = NULL;
            return NULL;
        default:
            // If we have to execute the slow path, then
            // return the pointer to the place to deposit the left
            // map.
            return left_ptrs.map_ptr;
        }
    }

    // Cases 0 and 1: nothing to slide over; we are done.
    return NULL;
}


/**
 * Executes any reductions necessary to splice the "ff" frame out of
 * the steal tree.
 *
 * This method executes the "slow" path for reductions on a spawn
 * return, i.e., there are non-NULL maps that need to be merged
 * together.
 *
 * This method should execute only if
 * fast_path_reductions_for_spawn_return() returns a non-NULL
 * left_map_ptr.
 *
 * Upon entry, left_map_ptr should be the location of the left map
 * at the start of the reduction, as calculated by
 * fast_path_reductions_for_spawn_return().
 *
 * After this method completes:
 *    1. We have spliced ff out of the tree of full frames.
 *    2. The reducer maps of child_ff have been deposited
 *       "left" according to the reducer protocol.
 *    3. w->l->fiber_to_free stores the fiber
 *       that needs to be freed once we jump into the runtime.
 * We have not, however, decremented the join counter on ff->parent,
 * so no one can resume execution of the parent yet.
 *
 * WARNING:
 *    This method assumes the lock on ff->parent is held upon entry, and
 *    the worker that returns from this method still holds that lock.
 *    This method can, however, release and reacquire the lock on
 *    ff->parent while it runs.
 *
 * @param w            The currently executing worker.
 * @param ff           The full frame returning from a spawn.
 * @param left_map_ptr Pointer to our initial left map.
 * @return             The worker that this method returns on.
 */
static __cilkrts_worker*
slow_path_reductions_for_spawn_return(__cilkrts_worker *w,
                                      full_frame *ff,
                                      struct cilkred_map **left_map_ptr)
{
    // ASSERT: w is holding the frame lock on ff->parent.
#if REDPAR_DEBUG > 0
    CILK_ASSERT(!ff->rightmost_child);
    CILK_ASSERT(!ff->is_call_child);
#endif

    // Loop invariant:
    // When beginning this loop, we should
    //  1. be holding the lock on ff->parent,
    //  2. left_map_ptr should be the address of the pointer to the left map,
    //  3. all maps should be slid over left by one, if possible, and
    //  4. all exceptions should be merged so far.
    while (1) {

        // Slide the middle map left if possible.
        if (!(*left_map_ptr)) {
            *left_map_ptr = w->reducer_map;
            w->reducer_map = NULL;
        }
        // Slide the right map to the middle if possible.
        if (!w->reducer_map) {
            w->reducer_map = ff->right_reducer_map;
            ff->right_reducer_map = NULL;
        }

        // Since we slid everything left by one,
        // we are finished if there is no middle map.
        if (!w->reducer_map) {
            verify_current_wkr(w);
            return w;
        }
        else {
            struct cilkred_map* left_map;
            struct cilkred_map* middle_map;
            struct cilkred_map* right_map;

            // Take all the maps from their respective locations.
            // We can't leave them in place and execute a reduction because
            // these fields might change once we release the lock.
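            // (For example, while ff->parent's lock is dropped, other
            // children of the parent can return and splice themselves out,
            // changing who is to our "left" -- hence the recomputation of
            // left_ptrs after we re-acquire the lock below.)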
            left_map = *left_map_ptr;
            *left_map_ptr = NULL;
            middle_map = w->reducer_map;
            w->reducer_map = NULL;
            right_map = ff->right_reducer_map;
            ff->right_reducer_map = NULL;

            // WARNING!!! Lock release here.
            // We have reductions to execute (and we can't hold locks).
            __cilkrts_frame_unlock(w, ff->parent);

            // Merge all the reducers into the left map.
            left_map = repeated_merge_reducer_maps(&w,
                                                   left_map,
                                                   middle_map);
            verify_current_wkr(w);
            left_map = repeated_merge_reducer_maps(&w,
                                                   left_map,
                                                   right_map);
            verify_current_wkr(w);
            CILK_ASSERT(NULL == w->reducer_map);
            // Put the final answer back into w->reducer_map.
            w->reducer_map = left_map;

            // Save any exceptions generated because of the reduction
            // process from the returning worker.  These get merged
            // the next time around the loop.
            CILK_ASSERT(NULL == ff->pending_exception);
            ff->pending_exception = w->l->pending_exception;
            w->l->pending_exception = NULL;

            // Lock ff->parent for the next loop around.
            __cilkrts_frame_lock(w, ff->parent);

            // Once we have the lock again, recompute who is to our
            // left.
            splice_left_ptrs left_ptrs;
            left_ptrs = compute_left_ptrs_for_spawn_return(w, ff);

            // Update the pointer for the left map.
            left_map_ptr = left_ptrs.map_ptr;
            // Splice the exceptions for the spawn.
            splice_exceptions_for_spawn(w, ff, left_ptrs.exception_ptr);
        }
    }
    // We should never break out of this loop.

    CILK_ASSERT(0);
    return NULL;
}


/**
 * Execute reductions when returning from a spawn whose parent has
 * been stolen.
 *
 * Execution may start on w, but may finish on a different worker.
 * This method acquires/releases the lock on ff->parent.
 *
 * @param w            The currently executing worker.
 * @param ff           The full frame of the spawned function that is returning.
 * @param returning_sf The __cilkrts_stack_frame for this returning function.
 * @return             The worker returning from this method.
 */
static __cilkrts_worker*
execute_reductions_for_spawn_return(__cilkrts_worker *w,
                                    full_frame *ff,
                                    __cilkrts_stack_frame *returning_sf)
{
    // Step A1 from the reducer protocol described above.
    //
    // Coerce the runtime into thinking that
    // ff/returning_sf are still on the bottom of
    // w's deque.
    restore_frame_for_spawn_return_reduction(w, ff, returning_sf);

    // Steps A2 and A3: Execute reductions on the user stack.
    BEGIN_WITH_FRAME_LOCK(w, ff->parent) {
        struct cilkred_map **left_map_ptr;
        left_map_ptr = fast_path_reductions_for_spawn_return(w, ff);

        // The pointer will be non-NULL if there are
        // still reductions to execute.
        if (left_map_ptr) {
            // WARNING: This method call may release the lock
            // on ff->parent and re-acquire it (possibly on a
            // different worker).
            // We can't hold locks while actually executing
            // reduce functions.
            w = slow_path_reductions_for_spawn_return(w,
                                                      ff,
                                                      left_map_ptr);
            verify_current_wkr(w);
        }

        finish_spawn_return_on_user_stack(w, ff->parent, ff);
        // WARNING: the use of this lock macro is deceptive.
        // The worker may have changed here.
    } END_WITH_FRAME_LOCK(w, ff->parent);
    return w;
}
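/* Mapping back to the protocol comment above: step A1 is
 * restore_frame_for_spawn_return_reduction(), steps A2/A3 are the fast/slow
 * reduction paths plus finish_spawn_return_on_user_stack(), and step A4 is
 * performed by the caller, e.g. the longjmp_into_runtime(w,
 * do_return_from_spawn, 0) call in __cilkrts_migrate_exception() above. */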
/**
 * Execute fast "reductions" when ff stalls at a sync.
 *
 * @param w  The currently executing worker.
 * @param ff The full frame stalling at a sync.
 * @return 1 if we are finished with all reductions after calling this method.
 * @return 0 if we still need to execute the slow path reductions.
 */
static inline
int fast_path_reductions_for_sync(__cilkrts_worker *w,
                                  full_frame *ff) {
    // Return 0 if there is some reduction that needs to happen.
    return !(w->reducer_map || ff->pending_exception);
}

/**
 * Executes slow reductions when ff stalls at a sync.
 * This method should execute only if
 * fast_path_reductions_for_sync(w, ff) returned 0.
 *
 * After this method completes:
 *   1. ff's current reducer map has been deposited into
 *      right_reducer_map of ff's rightmost child, or
 *      ff->children_reducer_map if ff has no children.
 *   2. Similarly for ff's current exception.
 *   3. Nothing to calculate for stacks --- if we are stalling
 *      we will always free a stack.
 *
 * This method may repeatedly acquire/release the lock on ff.
 *
 * @param w  The currently executing worker.
 * @param ff The full frame stalling at a sync.
 * @return   The worker returning from this method.
 */
static __cilkrts_worker*
slow_path_reductions_for_sync(__cilkrts_worker *w,
                              full_frame *ff)
{
    struct cilkred_map *left_map;
    struct cilkred_map *middle_map;

#if (REDPAR_DEBUG > 0)
    CILK_ASSERT(ff);
    CILK_ASSERT(w->head == w->tail);
#endif

    middle_map = w->reducer_map;
    w->reducer_map = NULL;

    // Loop invariant: middle_map should be valid (the current map to reduce).
    //                 left_map is junk.
    //                 w->reducer_map == NULL.
    while (1) {
        BEGIN_WITH_FRAME_LOCK(w, ff) {
            splice_left_ptrs left_ptrs = compute_left_ptrs_for_sync(w, ff);

            // Grab the "left" map and store pointers to those locations.
            left_map = *(left_ptrs.map_ptr);
            *(left_ptrs.map_ptr) = NULL;

            // Slide the maps in our struct left as far as possible.
            if (!left_map) {
                left_map = middle_map;
                middle_map = NULL;
            }

            *(left_ptrs.exception_ptr) =
                __cilkrts_merge_pending_exceptions(w,
                                                   *left_ptrs.exception_ptr,
                                                   ff->pending_exception);
            ff->pending_exception = NULL;

            // If there is no middle map, then we are done.
            // Deposit the left map and return.
            if (!middle_map) {
                *(left_ptrs.map_ptr) = left_map;
#if (REDPAR_DEBUG > 0)
                CILK_ASSERT(NULL == w->reducer_map);
#endif
                // Sanity check upon leaving the loop.
                verify_current_wkr(w);
                // Make sure to unlock before we return!
                __cilkrts_frame_unlock(w, ff);
                return w;
            }
        } END_WITH_FRAME_LOCK(w, ff);

        // If we get here, we have a nontrivial reduction to execute.
        middle_map = repeated_merge_reducer_maps(&w,
                                                 left_map,
                                                 middle_map);
        verify_current_wkr(w);

        // Save any exceptions generated because of the reduction
        // process.  These get merged the next time around the
        // loop.
        CILK_ASSERT(NULL == ff->pending_exception);
        ff->pending_exception = w->l->pending_exception;
        w->l->pending_exception = NULL;
    }

    // We should never break out of the loop above.
    CILK_ASSERT(0);
    return NULL;
}
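/* Note on the loop above: each pass either deposits the surviving map to
 * the "left" and returns, or merges (left_map, middle_map) into a new
 * middle map and retries -- the set of children (and hence who is to our
 * left) may have changed while the lock on ff was dropped for the merge. */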
/**
 * Execute reductions when ff stalls at a sync.
 *
 * Execution starts on w, but may finish on a different worker.
 * This method may acquire/release the lock on ff.
 *
 * @param w          The currently executing worker.
 * @param ff         The full frame of the spawned function at the sync.
 * @param sf_at_sync The __cilkrts_stack_frame stalling at the sync.
 * @return           The worker returning from this method.
 */
static __cilkrts_worker*
execute_reductions_for_sync(__cilkrts_worker *w,
                            full_frame *ff,
                            __cilkrts_stack_frame *sf_at_sync)
{
    int finished_reductions;
    // Step B1 from the reducer protocol above:
    // Restore the runtime invariants.
    //
    // The following code for this step is almost equivalent to
    // the following sequence:
    //   1. disown(w, ff, sf_at_sync, "sync") (which itself
    //      calls make_unrunnable(w, ff, sf_at_sync))
    //   2. make_runnable(w, ff, sf_at_sync).
    //
    // The "disown" will mark the frame "sf_at_sync"
    // as stolen and suspended, and save its place on the stack,
    // so it can be resumed after the sync.
    //
    // The difference is that we don't want the disown to
    // break the following connections yet, since we are
    // about to immediately make sf/ff runnable again anyway:
    //   sf_at_sync->worker == w
    //   w->l->frame_ff == ff
    //
    // These connections are needed for parallel reductions, since
    // we will use sf / ff as the stack frame / full frame for
    // executing any potential reductions.
    //
    // TBD: Can we refactor the disown / make_unrunnable code
    // to avoid the code duplication here?

    ff->call_stack = NULL;

    // Normally, "make_unrunnable" would add CILK_FRAME_STOLEN and
    // CILK_FRAME_SUSPENDED to sf_at_sync->flags and save the state of
    // the stack so that a worker can resume the frame in the correct
    // place.
    //
    // But on this path, CILK_FRAME_STOLEN should already be set.
    // Also, we technically don't want to suspend the frame until
    // the reduction finishes.
    // We do, however, need to save the stack before
    // we start any reductions, since the reductions might push more
    // data onto the stack.
    CILK_ASSERT(sf_at_sync->flags & CILK_FRAME_STOLEN);

    __cilkrts_put_stack(ff, sf_at_sync);
    __cilkrts_make_unrunnable_sysdep(w, ff, sf_at_sync, 1,
                                     "execute_reductions_for_sync");
    CILK_ASSERT(w->l->frame_ff == ff);

    // Step B2: Execute reductions on the user stack.
    // Check if we have any "real" reductions to do.
    finished_reductions = fast_path_reductions_for_sync(w, ff);

    if (!finished_reductions) {
        // We still have some real reductions to execute.
        // Run them here.

        // This method may acquire/release the lock on ff.
        w = slow_path_reductions_for_sync(w, ff);

        // The previous call may return on a different worker
        // than the one we started on.
        verify_current_wkr(w);
    }

#if REDPAR_DEBUG >= 0
    CILK_ASSERT(w->l->frame_ff == ff);
    CILK_ASSERT(ff->call_stack == NULL);
#endif

    // Now we suspend the frame ff (since we've
    // finished the reductions).  Roughly, we've split apart the
    // "make_unrunnable" call here --- we already saved the
    // stack info earlier, before the reductions executed.
    // All that remains is to restore the call stack back into the
    // full frame, and mark the frame as suspended.
    ff->call_stack = sf_at_sync;
    sf_at_sync->flags |= CILK_FRAME_SUSPENDED;

    // At a nontrivial sync, we should always free the current fiber,
    // because it cannot be the leftmost one.
    w->l->fiber_to_free = ff->fiber_self;
    ff->fiber_self = NULL;
    return w;
}


/*
  Local Variables: **
  c-file-style:"bsd" **
  c-basic-offset:4 **
  indent-tabs-mode:nil **
  End: **
*/