1/* scheduler.c -*-C-*- 2 * 3 ************************************************************************* 4 * 5 * @copyright 6 * Copyright (C) 2007-2013, Intel Corporation 7 * All rights reserved. 8 * 9 * @copyright 10 * Redistribution and use in source and binary forms, with or without 11 * modification, are permitted provided that the following conditions 12 * are met: 13 * 14 * * Redistributions of source code must retain the above copyright 15 * notice, this list of conditions and the following disclaimer. 16 * * Redistributions in binary form must reproduce the above copyright 17 * notice, this list of conditions and the following disclaimer in 18 * the documentation and/or other materials provided with the 19 * distribution. 20 * * Neither the name of Intel Corporation nor the names of its 21 * contributors may be used to endorse or promote products derived 22 * from this software without specific prior written permission. 23 * 24 * @copyright 25 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 26 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 27 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 28 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 29 * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 30 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, 31 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS 32 * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED 33 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 34 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY 35 * WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 36 * POSSIBILITY OF SUCH DAMAGE. 
37 * 38 **************************************************************************/ 39 40/* 41 * Cilk scheduler 42 */ 43 44#include "scheduler.h" 45#include "bug.h" 46#include "os.h" 47#include "os_mutex.h" 48#include "local_state.h" 49#include "signal_node.h" 50#include "full_frame.h" 51#include "sysdep.h" 52#include "except.h" 53#include "cilk_malloc.h" 54#include "pedigrees.h" 55#include "record-replay.h" 56 57#include <limits.h> 58#include <string.h> /* memcpy */ 59#include <stdio.h> // sprintf 60#include <stdlib.h> // malloc, free, abort 61 62#ifdef _WIN32 63# pragma warning(disable:1786) // disable warning: sprintf is deprecated 64# include "sysdep-win.h" 65# include "except-win32.h" 66#endif // _WIN32 67 68// ICL: Don't complain about conversion from pointer to same-sized integral 69// type in __cilkrts_put_stack. That's why we're using ptrdiff_t 70#ifdef _WIN32 71# pragma warning(disable: 1684) 72#endif 73 74#include "cilk/cilk_api.h" 75#include "frame_malloc.h" 76#include "metacall_impl.h" 77#include "reducer_impl.h" 78#include "cilk-tbb-interop.h" 79#include "cilk-ittnotify.h" 80#include "stats.h" 81 82// ICL: Don't complain about loss of precision in myrand 83// I tried restoring the warning after the function, but it didn't 84// suppress it 85#ifdef _WIN32 86# pragma warning(disable: 2259) 87#endif 88 89#ifndef _WIN32 90# include <unistd.h> 91#endif 92 93#ifdef __VXWORKS__ 94// redeclare longjmp() with noreturn to stop warnings 95extern __attribute__((noreturn)) 96 void longjmp(jmp_buf, int); 97#endif 98 99//#define DEBUG_LOCKS 1 100#ifdef DEBUG_LOCKS 101// The currently executing worker must own this worker's lock 102# define ASSERT_WORKER_LOCK_OWNED(w) \ 103 { \ 104 __cilkrts_worker *tls_worker = __cilkrts_get_tls_worker(); \ 105 CILK_ASSERT((w)->l->lock.owner == tls_worker); \ 106 } 107#else 108# define ASSERT_WORKER_LOCK_OWNED(w) 109#endif // DEBUG_LOCKS 110 111// Options for the scheduler. 
enum schedule_t { SCHEDULE_RUN,
                  SCHEDULE_WAIT,
                  SCHEDULE_EXIT };

// Return values for provably_good_steal()
enum provably_good_steal_t
{
    ABANDON_EXECUTION,  // Not the last child to the sync - attempt to steal work
    CONTINUE_EXECUTION, // Last child to the sync - continue executing on this worker
    WAIT_FOR_CONTINUE   // The replay log indicates that this was the worker
                        // which continued.  Loop until we are the last worker
                        // to the sync.
};


// Verify that "w" is the worker we are currently executing on.
// Because this check is expensive, this method is usually a no-op.
static inline void verify_current_wkr(__cilkrts_worker *w)
{
#if ((REDPAR_DEBUG >= 3) || (FIBER_DEBUG >= 1))
    // Lookup the worker from TLS and compare to w.
    __cilkrts_worker* tmp = __cilkrts_get_tls_worker();
    if (w != tmp) {
        fprintf(stderr, "Error.  W=%d, actual worker =%d...\n",
                w->self,
                tmp->self);
    }
    CILK_ASSERT(w == tmp);
#endif
}

// Decide what the scheduler loop should do next for worker w
// (run, wait, or exit).  Defined later in this file.
static enum schedule_t worker_runnable(__cilkrts_worker *w);

// Scheduling-fiber functions:
static void do_return_from_spawn (__cilkrts_worker *w,
                                  full_frame *ff,
                                  __cilkrts_stack_frame *sf);
static void do_sync (__cilkrts_worker *w,
                     full_frame *ff,
                     __cilkrts_stack_frame *sf);

// max is defined on Windows and VxWorks
#if (! defined(_WIN32)) && (! defined(__VXWORKS__))
    // TBD: definition of max() for Linux.
# define max(a, b) ((a) < (b) ? (b) : (a))
#endif

// Dump per-worker and aggregate statistics (only when compiled with
// CILK_PROFILE) plus basic thread-count and memory-usage info to stderr.
void __cilkrts_dump_stats_to_stderr(global_state_t *g)
{
#ifdef CILK_PROFILE
    int i;
    for (i = 0; i < g->total_workers; ++i) {
        // Print out statistics for each worker.  We collected them,
        // so why not print them out?
        fprintf(stderr, "Stats for worker %d\n", i);
        dump_stats_to_file(stderr, g->workers[i]->l->stats);
        __cilkrts_accum_stats(&g->stats, g->workers[i]->l->stats);
    }

    // Also print out aggregate statistics.
    dump_stats_to_file(stderr, &g->stats);
#endif
    fprintf(stderr,
            "CILK PLUS Thread Info: P=%d, Q=%d\n",
            g->P,
            g->Q);
    fprintf(stderr,
            "CILK PLUS RUNTIME MEMORY USAGE: %lld bytes",
            (long long)g->frame_malloc.allocated_from_os);
#ifdef CILK_PROFILE
    if (g->stats.stack_hwm)
        fprintf(stderr, ", %ld stacks", g->stats.stack_hwm);
#endif
    fputc('\n', stderr);
}

// Abort the process if w's magic numbers have been clobbered,
// indicating runtime-state corruption.
static void validate_worker(__cilkrts_worker *w)
{
    /* check the magic numbers, for debugging purposes */
    if (w->l->worker_magic_0 != WORKER_MAGIC_0 ||
        w->l->worker_magic_1 != WORKER_MAGIC_1)
        abort_because_rts_is_corrupted();
}

// Make left_ff and right_ff adjacent siblings in the doubly linked
// child list; either argument may be NULL.
static void double_link(full_frame *left_ff, full_frame *right_ff)
{
    if (left_ff)
        left_ff->right_sibling = right_ff;
    if (right_ff)
        right_ff->left_sibling = left_ff;
}

/* add CHILD to the right of all children of PARENT */
static void push_child(full_frame *parent_ff, full_frame *child_ff)
{
    double_link(parent_ff->rightmost_child, child_ff);
    double_link(child_ff, 0);
    parent_ff->rightmost_child = child_ff;
}

/* unlink CHILD from the list of all children of PARENT */
static void unlink_child(full_frame *parent_ff, full_frame *child_ff)
{
    double_link(child_ff->left_sibling, child_ff->right_sibling);

    if (!child_ff->right_sibling) {
        /* this is the rightmost child -- update parent link */
        CILK_ASSERT(parent_ff->rightmost_child == child_ff);
        parent_ff->rightmost_child = child_ff->left_sibling;
    }
    child_ff->left_sibling = child_ff->right_sibling = 0; /* paranoia */
}

// Increment ff's join counter: one more outstanding child or reference.
static void incjoin(full_frame *ff)
{
    ++ff->join_counter;
}

// Decrement ff's join counter and return the new value.
static int decjoin(full_frame *ff)
{
    CILK_ASSERT(ff->join_counter > 0);
    return (--ff->join_counter);
}

// Return the value decjoin() would return, without modifying the counter.
static int simulate_decjoin(full_frame *ff)
{
    CILK_ASSERT(ff->join_counter > 0);
    return (ff->join_counter - 1);
}

/*
 *
 Pseudo-random generator defined by the congruence S' = 69070 * S
 * mod (2^32 - 5).  Marsaglia (CACM July 1993) says on page 107 that
 * this is a ``good one''.  There you go.
 *
 * The literature makes a big fuss about avoiding the division, but
 * for us it is not worth the hassle.
 */
static const unsigned RNGMOD = ((1ULL << 32) - 5);
static const unsigned RNGMUL = 69070U;

// Advance w's private PRNG state and return the new value.  Each
// worker has its own seed, so no synchronization is needed.
static unsigned myrand(__cilkrts_worker *w)
{
    unsigned state = w->l->rand_seed;
    state = (unsigned)((RNGMUL * (unsigned long long)state) % RNGMOD);
    w->l->rand_seed = state;
    return state;
}

// Seed w's PRNG, avoiding the fixed point 0.
static void mysrand(__cilkrts_worker *w, unsigned seed)
{
    seed %= RNGMOD;
    seed += (seed == 0); /* 0 does not belong to the multiplicative
                            group.  Use 1 instead */
    w->l->rand_seed = seed;
}

/* W grabs its own lock */
void __cilkrts_worker_lock(__cilkrts_worker *w)
{
    validate_worker(w);
    CILK_ASSERT(w->l->do_not_steal == 0);

    /* tell thieves to stay out of the way */
    w->l->do_not_steal = 1;
    __cilkrts_fence(); /* probably redundant */

    __cilkrts_mutex_lock(w, &w->l->lock);
}

// Release w's own lock and then clear the do_not_steal flag so
// thieves may approach again.
void __cilkrts_worker_unlock(__cilkrts_worker *w)
{
    __cilkrts_mutex_unlock(w, &w->l->lock);
    CILK_ASSERT(w->l->do_not_steal == 1);
    /* The fence is probably redundant.  Use a release
       operation when supported (gcc and compatible);
       that is faster on x86 which serializes normal stores. */
#if defined __GNUC__ && (__GNUC__ * 10 + __GNUC_MINOR__ > 43 || __ICC >= 1110)
    __sync_lock_release(&w->l->do_not_steal);
#else
    w->l->do_not_steal = 0;
    __cilkrts_fence(); /* store-store barrier, redundant on x86 */
#endif
}

/* try to acquire the lock of some *other* worker */
static int worker_trylock_other(__cilkrts_worker *w,
                                __cilkrts_worker *other)
{
    int status = 0;

    validate_worker(other);

    /* This protocol guarantees that, after setting the DO_NOT_STEAL
       flag, worker W can enter its critical section after waiting for
       the thief currently in the critical section (if any) and at
       most one other thief.

       This requirement is overly paranoid, but it should protect us
       against future nonsense from OS implementors.
    */

    /* compete for the right to disturb OTHER */
    if (__cilkrts_mutex_trylock(w, &other->l->steal_lock)) {
        if (other->l->do_not_steal) {
            /* leave it alone */
        } else {
            status = __cilkrts_mutex_trylock(w, &other->l->lock);
        }
        __cilkrts_mutex_unlock(w, &other->l->steal_lock);
    }


    return status;
}

// Release a lock previously obtained via worker_trylock_other().
static void worker_unlock_other(__cilkrts_worker *w,
                                __cilkrts_worker *other)
{
    __cilkrts_mutex_unlock(w, &other->l->lock);
}


/* Lock macro Usage:
    BEGIN_WITH_WORKER_LOCK(w) {
        statement;
        statement;
        BEGIN_WITH_FRAME_LOCK(w, ff) {
            statement;
            statement;
        } END_WITH_FRAME_LOCK(w, ff);
    } END_WITH_WORKER_LOCK(w);
 */
#define BEGIN_WITH_WORKER_LOCK(w) __cilkrts_worker_lock(w); do
#define END_WITH_WORKER_LOCK(w)   while (__cilkrts_worker_unlock(w), 0)

// TBD(jsukha): These are worker lock acquisitions on
// a worker whose deque is empty.  My conjecture is that we
// do not need to hold the worker lock at these points.
// I have left them in for now, however.
//
// #define REMOVE_POSSIBLY_OPTIONAL_LOCKS
#ifdef REMOVE_POSSIBLY_OPTIONAL_LOCKS
    #define BEGIN_WITH_WORKER_LOCK_OPTIONAL(w) do
    #define END_WITH_WORKER_LOCK_OPTIONAL(w)   while (0)
#else
    #define BEGIN_WITH_WORKER_LOCK_OPTIONAL(w) __cilkrts_worker_lock(w); do
    #define END_WITH_WORKER_LOCK_OPTIONAL(w)   while (__cilkrts_worker_unlock(w), 0)
#endif


#define BEGIN_WITH_FRAME_LOCK(w, ff)                                     \
    do { full_frame *_locked_ff = ff; __cilkrts_frame_lock(w, _locked_ff); do

#define END_WITH_FRAME_LOCK(w, ff)                       \
    while (__cilkrts_frame_unlock(w, _locked_ff), 0); } while (0)

/* W becomes the owner of F and F can be stolen from W */
static void make_runnable(__cilkrts_worker *w, full_frame *ff)
{
    w->l->frame_ff = ff;

    /* CALL_STACK is invalid (the information is stored implicitly in W) */
    ff->call_stack = 0;
}

/*
 * Suspend full frame ff with call stack sf: mark sf stolen/suspended
 * and perform system-dependent bookkeeping.  is_loot is nonzero when
 * ff is the frame being carried off by a thief.
 *
 * The worker parameter is unused, except for print-debugging purposes.
 */
static void make_unrunnable(__cilkrts_worker *w,
                            full_frame *ff,
                            __cilkrts_stack_frame *sf,
                            int is_loot,
                            const char *why)
{
    /* CALL_STACK becomes valid again */
    ff->call_stack = sf;

    if (sf) {
#if CILK_LIB_DEBUG
        if (__builtin_expect(sf->flags & CILK_FRAME_EXITING, 0))
            __cilkrts_bug("W%d suspending exiting frame %p/%p\n", w->self, ff, sf);
#endif
        sf->flags |= CILK_FRAME_STOLEN | CILK_FRAME_SUSPENDED;
        sf->worker = 0;

        if (is_loot)
            __cilkrts_put_stack(ff, sf);

        /* perform any system-dependent action, such as saving the
           state of the stack */
        __cilkrts_make_unrunnable_sysdep(w, ff, sf, is_loot, why);
    }
}


/* Push the next full frame to be made active in this worker and increment its
 * join counter.  __cilkrts_push_next_frame and pop_next_frame work on a
 * one-element queue.  This queue is used to communicate across the runtime
 * from the code that wants to activate a frame to the code that can actually
 * begin execution on that frame.  They are asymmetrical in that push
 * increments the join counter but pop does not decrement it.  Rather, a
 * single push/pop combination makes a frame active and increments its join
 * counter once. */
void __cilkrts_push_next_frame(__cilkrts_worker *w, full_frame *ff)
{
    CILK_ASSERT(ff);
    CILK_ASSERT(!w->l->next_frame_ff);
    incjoin(ff);
    w->l->next_frame_ff = ff;
}

/* Get the next full-frame to be made active in this worker.  The join count
 * of the full frame will have been incremented by the corresponding push
 * event.  See __cilkrts_push_next_frame, above.
 */
static full_frame *pop_next_frame(__cilkrts_worker *w)
{
    full_frame *ff;
    ff = w->l->next_frame_ff;
    // Remove the frame from the next_frame field.
    //
    // If this is a user worker, then there is a chance that another worker
    // from our team could push work into our next_frame (if it is the last
    // worker doing work for this team).  The other worker's setting of the
    // next_frame could race with our setting of next_frame to NULL.  This is
    // the only possible race condition on next_frame.  However, if next_frame
    // has a non-NULL value, then it means the team still has work to do, and
    // there is no chance of another team member populating next_frame.  Thus,
    // it is safe to set next_frame to NULL, if it was populated.  There is no
    // need for an atomic op.
    if (NULL != ff) {
        w->l->next_frame_ff = NULL;
    }
    return ff;
}

/*
 * Identify the single worker that is allowed to cross a sync in this frame.  A
 * thief should call this function when it is the first to steal work from a
 * user worker.
"First to steal work" may mean that there has been parallelism 453 * in the user worker before, but the whole team sync'd, and this is the first 454 * steal after that. 455 * 456 * This should happen while holding the worker and frame lock. 457 */ 458static void set_sync_master(__cilkrts_worker *w, full_frame *ff) 459{ 460 w->l->last_full_frame = ff; 461 ff->sync_master = w; 462} 463 464/* 465 * The sync that ends all parallelism for a particular user worker is about to 466 * be crossed. Decouple the worker and frame. 467 * 468 * No locks need to be held since the user worker isn't doing anything, and none 469 * of the system workers can steal from it. But unset_sync_master() should be 470 * called before the user worker knows about this work (i.e., before it is 471 * inserted into the w->l->next_frame_ff is set). 472 */ 473static void unset_sync_master(__cilkrts_worker *w, full_frame *ff) 474{ 475 CILK_ASSERT(WORKER_USER == w->l->type); 476 CILK_ASSERT(ff->sync_master == w); 477 ff->sync_master = NULL; 478 w->l->last_full_frame = NULL; 479} 480 481/******************************************************************** 482 * THE protocol: 483 ********************************************************************/ 484/* 485 * This is a protocol for work stealing that minimizes the overhead on 486 * the victim. 487 * 488 * The protocol uses three shared pointers into the worker's deque: 489 * - T - the "tail" 490 * - H - the "head" 491 * - E - the "exception" NB: In this case, "exception" has nothing to do 492 * with C++ throw-catch exceptions -- it refers only to a non-normal return, 493 * i.e., a steal or similar scheduling exception. 494 * 495 * with H <= E, H <= T. 496 * 497 * Stack frames SF, where H <= E < T, are available for stealing. 498 * 499 * The worker operates on the T end of the stack. The frame being 500 * worked on is not on the stack. To make a continuation available for 501 * stealing the worker pushes a from onto the stack: stores *T++ = SF. 
502 * To return, it pops the frame off the stack: obtains SF = *--T. 503 * 504 * After decrementing T, the condition E > T signals to the victim that 505 * it should invoke the runtime system's "THE" exception handler. The 506 * pointer E can become INFINITY, in which case the victim must invoke 507 * the THE exception handler as soon as possible. 508 * 509 * See "The implementation of the Cilk-5 multithreaded language", PLDI 1998, 510 * http://portal.acm.org/citation.cfm?doid=277652.277725, for more information 511 * on the THE protocol. 512 */ 513 514/* the infinity value of E */ 515#define EXC_INFINITY ((__cilkrts_stack_frame **) (-1)) 516 517static void increment_E(__cilkrts_worker *victim) 518{ 519 __cilkrts_stack_frame *volatile *tmp; 520 521 // The currently executing worker must own the worker lock to touch 522 // victim->exc 523 ASSERT_WORKER_LOCK_OWNED(victim); 524 525 tmp = victim->exc; 526 if (tmp != EXC_INFINITY) { 527 /* On most x86 this pair of operations would be slightly faster 528 as an atomic exchange due to the implicit memory barrier in 529 an atomic instruction. */ 530 victim->exc = tmp + 1; 531 __cilkrts_fence(); 532 } 533} 534 535static void decrement_E(__cilkrts_worker *victim) 536{ 537 __cilkrts_stack_frame *volatile *tmp; 538 539 // The currently executing worker must own the worker lock to touch 540 // victim->exc 541 ASSERT_WORKER_LOCK_OWNED(victim); 542 543 tmp = victim->exc; 544 if (tmp != EXC_INFINITY) { 545 /* On most x86 this pair of operations would be slightly faster 546 as an atomic exchange due to the implicit memory barrier in 547 an atomic instruction. 
        */
        victim->exc = tmp - 1;
        __cilkrts_fence(); /* memory fence not really necessary */
    }
}

#if 0
/* for now unused, will be necessary if we implement abort */
static void signal_THE_exception(__cilkrts_worker *wparent)
{
    wparent->exc = EXC_INFINITY;
    __cilkrts_fence();
}
#endif

// Restore w's exception pointer to its head pointer (normal state).
static void reset_THE_exception(__cilkrts_worker *w)
{
    // The currently executing worker must own the worker lock to touch
    // w->exc
    ASSERT_WORKER_LOCK_OWNED(w);

    w->exc = w->head;
    __cilkrts_fence();
}

/* conditions under which victim->head can be stolen: */
static int can_steal_from(__cilkrts_worker *victim)
{
    return ((victim->head < victim->tail) &&
            (victim->head < victim->protected_tail));
}

/* Return TRUE if the frame can be stolen, false otherwise */
static int dekker_protocol(__cilkrts_worker *victim)
{
    // increment_E and decrement_E are going to touch victim->exc.  The
    // currently executing worker must own victim's lock before they can
    // modify it
    ASSERT_WORKER_LOCK_OWNED(victim);

    /* ASSERT(E >= H); */

    increment_E(victim);

    /* ASSERT(E >= H + 1); */
    if (can_steal_from(victim)) {
        /* success, we can steal victim->head and set H <- H + 1
           in detach() */
        return 1;
    } else {
        /* failure, restore previous state */
        decrement_E(victim);
        return 0;
    }
}


/* Link PARENT and CHILD in the spawn tree */
static full_frame *make_child(__cilkrts_worker *w,
                              full_frame *parent_ff,
                              __cilkrts_stack_frame *child_sf,
                              cilk_fiber *fiber)
{
    full_frame *child_ff = __cilkrts_make_full_frame(w, child_sf);

    child_ff->parent = parent_ff;
    push_child(parent_ff, child_ff);

    //DBGPRINTF("%d-          make_child - child_frame: %p, parent_frame: %p, child_sf: %p\n"
    //          "            parent - parent: %p, left_sibling: %p, right_sibling: %p, rightmost_child: %p\n"
    //          "            child  - parent: %p, left_sibling: %p, right_sibling: %p, rightmost_child: %p\n",
    //          w->self, child, parent, child_sf,
    //          parent->parent, parent->left_sibling, parent->right_sibling, parent->rightmost_child,
    //          child->parent, child->left_sibling, child->right_sibling, child->rightmost_child);
    CILK_ASSERT(parent_ff->call_stack);
    child_ff->is_call_child = (fiber == NULL);

    /* PLACEHOLDER_FIBER is used as non-null marker indicating that
       child should be treated as a spawn child even though we have not
       yet assigned a real fiber to its parent. */
    if (fiber == PLACEHOLDER_FIBER)
        fiber = NULL; /* Parent actually gets a null fiber, for now */

    /* perform any system-dependent actions, such as capturing
       parameter passing information */
    /*__cilkrts_make_child_sysdep(child, parent);*/

    /* Child gets reducer map and stack of parent.
       Parent gets a new map and new stack. */
    child_ff->fiber_self = parent_ff->fiber_self;
    child_ff->sync_master = NULL;

    if (child_ff->is_call_child) {
        /* Cause segfault on any attempted access.  The parent gets
           the child map and stack when the child completes. */
        parent_ff->fiber_self = 0;
    } else {
        parent_ff->fiber_self = fiber;
    }

    incjoin(parent_ff);
    return child_ff;
}

// Detach sf from its parent link and return the former parent pointer.
static inline __cilkrts_stack_frame *__cilkrts_advance_frame(__cilkrts_stack_frame *sf)
{
    __cilkrts_stack_frame *p = sf->call_parent;
    sf->call_parent = 0;
    return p;
}

/* w should be the currently executing worker.
 * loot_sf is the youngest stack frame in the call stack being
 * unrolled (i.e., the most deeply nested stack frame.)
 *
 * When this method is called for a steal, loot_sf should be on a
 * victim worker which is different from w.
 * For CILK_FORCE_REDUCE, the victim worker will equal w.
 *
 * Before execution, the __cilkrts_stack_frame's have pointers from
 * older to younger, i.e., a __cilkrts_stack_frame points to parent.
 *
 * This method creates a full frame for each __cilkrts_stack_frame in
 * the call stack, with each full frame also pointing to its parent.
 *
 * The method returns the full frame created for loot_sf, i.e., the
 * youngest full frame.
 */
static full_frame *unroll_call_stack(__cilkrts_worker *w,
                                     full_frame *ff,
                                     __cilkrts_stack_frame *const loot_sf)
{
    __cilkrts_stack_frame *sf = loot_sf;
    __cilkrts_stack_frame *rev_sf = 0;
    __cilkrts_stack_frame *t_sf;

    CILK_ASSERT(sf);
    /*CILK_ASSERT(sf->call_parent != sf);*/

    /* The leafmost frame is unsynched. */
    if (sf->worker != w)
        sf->flags |= CILK_FRAME_UNSYNCHED;

    /* Reverse the call stack to make a linked list ordered from parent
       to child.  sf->call_parent points to the child of SF instead of
       the parent.  */
    do {
        /* Stop walking upward at a detached, stolen, or last frame --
           that frame is the oldest one belonging to this steal. */
        t_sf = (sf->flags & (CILK_FRAME_DETACHED|CILK_FRAME_STOLEN|CILK_FRAME_LAST))? 0 : sf->call_parent;
        sf->call_parent = rev_sf;
        rev_sf = sf;
        sf = t_sf;
    } while (sf);
    sf = rev_sf;

    /* Promote each stack frame to a full frame in order from parent
       to child, following the reversed list we just built. */
    make_unrunnable(w, ff, sf, sf == loot_sf, "steal 1");
    /* T is the *child* of SF, because we have reversed the list */
    for (t_sf = __cilkrts_advance_frame(sf); t_sf;
         sf = t_sf, t_sf = __cilkrts_advance_frame(sf)) {
        ff = make_child(w, ff, t_sf, NULL);
        make_unrunnable(w, ff, t_sf, t_sf == loot_sf, "steal 2");
    }

    /* XXX What if the leafmost frame does not contain a sync
       and this steal is from promote own deque? */
    /*sf->flags |= CILK_FRAME_UNSYNCHED;*/

    CILK_ASSERT(!sf->call_parent);
    return ff;
}

/* detach the top of the deque frame from the VICTIM and install a new
   CHILD frame in its place */
static void detach_for_steal(__cilkrts_worker *w,
                             __cilkrts_worker *victim,
                             cilk_fiber* fiber)
{
    /* ASSERT: we own victim->lock */

    full_frame *parent_ff, *child_ff, *loot_ff;
    __cilkrts_stack_frame *volatile *h;
    __cilkrts_stack_frame *sf;

    w->l->team = victim->l->team;

    CILK_ASSERT(w->l->frame_ff == 0 || w == victim);

    h = victim->head;

    CILK_ASSERT(*h);

    victim->head = h + 1;

    parent_ff = victim->l->frame_ff;
    BEGIN_WITH_FRAME_LOCK(w, parent_ff) {
        /* parent no longer referenced by victim */
        decjoin(parent_ff);

        /* obtain the victim call stack */
        sf = *h;

        /* perform system-dependent normalizations */
        /*__cilkrts_normalize_call_stack_on_steal(sf);*/

        /* unroll PARENT_FF with call stack SF, adopt the youngest
           frame LOOT.  If loot_ff == parent_ff, then we hold loot_ff->lock,
           otherwise, loot_ff is newly created and we can modify it without
           holding its lock. */
        loot_ff = unroll_call_stack(w, parent_ff, sf);

        #if REDPAR_DEBUG >= 3
        fprintf(stderr, "[W=%d, victim=%d, desc=detach, parent_ff=%p, loot=%p]\n",
                w->self, victim->self,
                parent_ff, loot_ff);
        #endif

        if (WORKER_USER == victim->l->type &&
            NULL == victim->l->last_full_frame) {
            // Mark this looted frame as special: only the original user worker
            // may cross the sync.
            //
            // This call is a shared access to
            // victim->l->last_full_frame.
            set_sync_master(victim, loot_ff);
        }

        /* LOOT is the next frame that the thief W is supposed to
           run, unless the thief is stealing from itself, in which
           case the thief W == VICTIM executes CHILD and nobody
           executes LOOT.
        */
        if (w == victim) {
            /* Pretend that frame has been stolen */
            loot_ff->call_stack->flags |= CILK_FRAME_UNSYNCHED;
            loot_ff->simulated_stolen = 1;
        }
        else
            __cilkrts_push_next_frame(w, loot_ff);

        // After this "push_next_frame" call, w now owns loot_ff.
        child_ff = make_child(w, loot_ff, 0, fiber);

        BEGIN_WITH_FRAME_LOCK(w, child_ff) {
            /* install child in the victim's work queue, taking
               the parent_ff's place */
            /* child is referenced by victim */
            incjoin(child_ff);

            // With this call, w is bestowing ownership of the newly
            // created frame child_ff to the victim, and victim is
            // giving up ownership of parent_ff.
            //
            // Worker w will either take ownership of parent_ff
            // if parent_ff == loot_ff, or parent_ff will be
            // suspended.
            //
            // Note that this call changes the victim->frame_ff
            // while the victim may be executing.
            make_runnable(victim, child_ff);
        } END_WITH_FRAME_LOCK(w, child_ff);
    } END_WITH_FRAME_LOCK(w, parent_ff);
}

/**
 * @brief cilk_fiber_proc that resumes user code after a successful
 * random steal.

 * This function longjmps back into the user code whose state is
 * stored in cilk_fiber_get_data(fiber)->resume_sf.  The stack pointer
 * is adjusted so that the code resumes on the specified fiber stack
 * instead of its original stack.
 *
 * This method gets executed only on a fiber freshly allocated from a
 * pool.
 *
 * @param fiber   The fiber being used to resume user code.
 * @param arg     Unused.
 */
static
void fiber_proc_to_resume_user_code_for_random_steal(cilk_fiber *fiber)
{
    cilk_fiber_data *data = cilk_fiber_get_data(fiber);
    __cilkrts_stack_frame* sf = data->resume_sf;
    full_frame *ff;

    CILK_ASSERT(sf);

    // When we pull the resume_sf out of the fiber to resume it, clear
    // the old value.
    data->resume_sf = NULL;
    CILK_ASSERT(sf->worker == data->owner);
    ff = sf->worker->l->frame_ff;

    // For Win32, we need to overwrite the default exception handler
    // in this function, so that when the OS exception handling code
    // walks off the top of the current Cilk stack, it reaches our stub
    // handler.

    // Also, this function needs to be wrapped into a try-catch block
    // so the compiler generates the appropriate exception information
    // in this frame.

    // TBD: IS THIS HANDLER IN THE WRONG PLACE?  Can we longjmp out of
    // this function (and does it matter?)
#if defined(_WIN32) && !defined(_WIN64)
    install_exception_stub_handler();
    __try
#endif
    {
        char* new_sp = sysdep_reset_jump_buffers_for_resume(fiber, ff, sf);

        // Notify the Intel tools that we're stealing code
        ITT_SYNC_ACQUIRED(sf->worker);
        NOTIFY_ZC_INTRINSIC("cilk_continue", sf);

        // TBD: We'd like to move TBB-interop methods into the fiber
        // eventually.
        cilk_fiber_invoke_tbb_stack_op(fiber, CILK_TBB_STACK_ADOPT);

        sf->flags &= ~CILK_FRAME_SUSPENDED;

        // longjmp to user code.  Don't process exceptions here,
        // because we are resuming a stolen frame.
        sysdep_longjmp_to_sf(new_sp, sf, NULL);
        /*NOTREACHED*/
        // Intel's C compiler respects the preceding lint pragma
    }
#if defined(_WIN32) && !defined(_WIN64)
    __except (CILK_ASSERT(!"should not execute the the stub filter"),
              EXCEPTION_EXECUTE_HANDLER)
    {
        // If we are here, that means something very wrong
        // has happened in our exception processing...
        CILK_ASSERT(! "should not be here!");
    }
#endif
}

// Attempt one random steal on behalf of worker w: pick a random other
// victim, allocate a fiber, run the THE/Dekker protocol under the
// victim's lock, and on success detach the loot and prime the fiber;
// on any failure the fiber is returned to the pool and w simply
// returns (the scheduler loop will retry).
static void random_steal(__cilkrts_worker *w)
{
    __cilkrts_worker *victim = NULL;
    cilk_fiber *fiber = NULL;
    int n;
    int success = 0;
    int32_t victim_id;

    // Nothing's been stolen yet.  When true, this will flag
    // setup_for_execution_pedigree to increment the pedigree
    w->l->work_stolen = 0;

    /* If the user has disabled stealing (using the debugger) we fail */
    if (__builtin_expect(w->g->stealing_disabled, 0))
        return;

    CILK_ASSERT(w->l->type == WORKER_SYSTEM || w->l->team == w);

    /* If there is only one processor work can still be stolen.
       There must be only one worker to prevent stealing. */
    CILK_ASSERT(w->g->total_workers > 1);

    /* pick random *other* victim */
    n = myrand(w) % (w->g->total_workers - 1);
    if (n >= w->self)
        ++n;

    // If we're replaying a log, override the victim.  -1 indicates that
    // we've exhausted the list of things this worker stole when we recorded
    // the log so just return.  If we're not replaying a log,
    // replay_get_next_recorded_victim() just returns the victim ID passed in.
    n = replay_get_next_recorded_victim(w, n);
    if (-1 == n)
        return;

    victim = w->g->workers[n];

    START_INTERVAL(w, INTERVAL_FIBER_ALLOCATE) {
        /* Verify that we can get a stack.  If not, no need to continue. */
        fiber = cilk_fiber_allocate(&w->l->fiber_pool);
    } STOP_INTERVAL(w, INTERVAL_FIBER_ALLOCATE);


    if (NULL == fiber) {
#if FIBER_DEBUG >= 2
        fprintf(stderr, "w=%d: failed steal because we could not get a fiber\n",
                w->self);
#endif
        return;
    }

    /* do not steal from self */
    CILK_ASSERT (victim != w);

    /* Execute a quick check before engaging in the THE protocol.
       Avoid grabbing locks if there is nothing to steal. */
    if (!can_steal_from(victim)) {
        NOTE_INTERVAL(w, INTERVAL_STEAL_FAIL_EMPTYQ);
        START_INTERVAL(w, INTERVAL_FIBER_DEALLOCATE) {
            int ref_count = cilk_fiber_remove_reference(fiber, &w->l->fiber_pool);
            // Fibers we use when trying to steal should not be active,
            // and thus should not have any other references.
            CILK_ASSERT(0 == ref_count);
        } STOP_INTERVAL(w, INTERVAL_FIBER_DEALLOCATE);
        return;
    }

    /* Attempt to steal work from the victim */
    if (worker_trylock_other(w, victim)) {
        if (w->l->type == WORKER_USER && victim->l->team != w) {

            // Fail to steal if this is a user worker and the victim is not
            // on this team.  If a user worker were allowed to steal work
            // descended from another user worker, the former might not be
            // done with its work by the time it was needed to resume and
            // unbind.  Therefore, user workers are not permitted to change
            // teams.

            // There is no race on the victim's team because the victim cannot
            // change its team until it runs out of work to do, at which point
            // it will try to take out its own lock, and this worker already
            // holds it.
            NOTE_INTERVAL(w, INTERVAL_STEAL_FAIL_USER_WORKER);

        } else if (victim->l->frame_ff) {
            // A successful steal will change victim->frame_ff, even
            // though the victim may be executing.  Thus, the lock on
            // the victim's deque is also protecting victim->frame_ff.
            if (dekker_protocol(victim)) {
                int proceed_with_steal = 1; // optimistic

                // If we're replaying a log, verify that this is the correct
                // frame to steal from the victim
                if (! replay_match_victim_pedigree(w, victim))
                {
                    // Abort the steal attempt.  decrement_E(victim) to
                    // counter the increment_E(victim) done by the
                    // dekker protocol
                    decrement_E(victim);
                    proceed_with_steal = 0;
                }

                if (proceed_with_steal)
                {
                    START_INTERVAL(w, INTERVAL_STEAL_SUCCESS) {
                        success = 1;
                        detach_for_steal(w, victim, fiber);
                        victim_id = victim->self;

                        #if REDPAR_DEBUG >= 1
                        fprintf(stderr, "Wkr %d stole from victim %d, fiber = %p\n",
                                w->self, victim->self, fiber);
                        #endif

                        // The use of victim->self contradicts our
                        // classification of the "self" field as
                        // local.  But since this code is only for
                        // debugging, it is ok.
                        DBGPRINTF ("%d-%p: Stealing work from worker %d\n"
                                   "            sf: %p, call parent: %p\n",
                                   w->self, GetCurrentFiber(), victim->self,
                                   w->l->next_frame_ff->call_stack,
                                   w->l->next_frame_ff->call_stack->call_parent);
                    } STOP_INTERVAL(w, INTERVAL_STEAL_SUCCESS);
                }  // end if(proceed_with_steal)
            } else {
                NOTE_INTERVAL(w, INTERVAL_STEAL_FAIL_DEKKER);
            }
        } else {
            NOTE_INTERVAL(w, INTERVAL_STEAL_FAIL_EMPTYQ);
        }
        worker_unlock_other(w, victim);
    } else {
        NOTE_INTERVAL(w, INTERVAL_STEAL_FAIL_LOCK);
    }

    // Record whether work was stolen.  When true, this will flag
    // setup_for_execution_pedigree to increment the pedigree
    w->l->work_stolen = success;

    if (0 == success) {
        // failed to steal work.  Return the fiber to the pool.
        START_INTERVAL(w, INTERVAL_FIBER_DEALLOCATE) {
            int ref_count = cilk_fiber_remove_reference(fiber, &w->l->fiber_pool);
            // Fibers we use when trying to steal should not be active,
            // and thus should not have any other references.
            CILK_ASSERT(0 == ref_count);
        } STOP_INTERVAL(w, INTERVAL_FIBER_DEALLOCATE);
    }
    else
    {
        // Since our steal was successful, finish initialization of
        // the fiber.
        cilk_fiber_reset_state(fiber,
                               fiber_proc_to_resume_user_code_for_random_steal);
        // Record the pedigree of the frame that w has stolen.
        // record only if CILK_RECORD_LOG is set
        replay_record_steal(w, victim_id);
    }
}



/**
 * At a provably good steal, we need to transfer the child reducer map
 * from ff->children_reducer_map into v->reducer_map, where v is the
 * worker that resumes execution of ff.
 *
 * Normally, we have v == w, where w is the currently executing
 * worker.  In the case where we are resuming a team leader on a user
 * worker, however, v might differ from w.
 *
 * Thus, this operation is a no-op, since we can't really move
 * ff->children_reducer_map into w here.
 *
 * Instead, this work is done in setup_for_execution_reducers().
 */
static inline void provably_good_steal_reducers(__cilkrts_worker *w,
                                                full_frame       *ff)
{
    // No-op.
}

/* at a provably good steal, incorporate the accumulated exceptions of
   children into the parent's exception */
static void provably_good_steal_exceptions(__cilkrts_worker *w,
                                           full_frame *ff)
{
    // ASSERT: we own ff->lock
    // Merge the exception accumulated from completed children with the
    // parent's own pending exception; the merge decides which one wins.
    ff->pending_exception =
        __cilkrts_merge_pending_exceptions(w,
                                           ff->child_pending_exception,
                                           ff->pending_exception);
    ff->child_pending_exception = NULL;
}

/* At sync discard the frame's old stack and take the leftmost child's. */
static void provably_good_steal_stacks(__cilkrts_worker *w, full_frame *ff)
{
    CILK_ASSERT(NULL == ff->fiber_self);
    ff->fiber_self = ff->fiber_child;
    ff->fiber_child = NULL;
}

/* Mark ff as synched: clear the UNSYNCHED flag on its stack frame and
   reset the simulated-steal marker. */
static void __cilkrts_mark_synched(full_frame *ff)
{
    ff->call_stack->flags &= ~CILK_FRAME_UNSYNCHED;
    ff->simulated_stolen = 0;
}

static
enum provably_good_steal_t provably_good_steal(__cilkrts_worker *w,
                                               full_frame *ff)
{
    // ASSERT: we hold w->lock and ff->lock

    enum provably_good_steal_t result = ABANDON_EXECUTION;

    // If the current replay entry is a sync record matching the worker's
    // pedigree, AND this isn't the last child to the sync, return
    // WAIT_FOR_CONTINUE to indicate that the caller should loop until
    // we find the right frame to steal and CONTINUE_EXECUTION is returned.
    int match_found = replay_match_sync_pedigree(w);
    if (match_found && (0 != simulate_decjoin(ff)))
        return WAIT_FOR_CONTINUE;

    START_INTERVAL(w, INTERVAL_PROVABLY_GOOD_STEAL) {
        // decjoin(ff) == 0 indicates this was the last outstanding child,
        // so ff can be synched and resumed.
        if (decjoin(ff) == 0) {
            provably_good_steal_reducers(w, ff);
            provably_good_steal_exceptions(w, ff);
            provably_good_steal_stacks(w, ff);
            __cilkrts_mark_synched(ff);

            // If the original owner wants this frame back (to resume
            // it on its original thread) pass it back now.
            if (NULL != ff->sync_master) {
                // The frame wants to go back and be executed by the original
                // user thread.  We can throw caution to the wind and push the
                // frame straight onto its queue because the only way we have
                // gotten to this point of being able to continue execution of
                // the frame is if the original user worker is spinning without
                // work.

                unset_sync_master(w->l->team, ff);
                __cilkrts_push_next_frame(w->l->team, ff);

                // If this is the team leader we're not abandoning the work
                if (w == w->l->team)
                    result = CONTINUE_EXECUTION;
            } else {
                __cilkrts_push_next_frame(w, ff);
                result = CONTINUE_EXECUTION;  // Continue working on this thread
            }

            // The __cilkrts_push_next_frame() call changes ownership
            // of ff to the specified worker.
        }
    } STOP_INTERVAL(w, INTERVAL_PROVABLY_GOOD_STEAL);

    // Only write a SYNC record if:
    // - We're recording a log *AND*
    // - We're the worker continuing from this sync
    replay_record_sync(w, result == CONTINUE_EXECUTION);

    // If we're replaying a log, and matched a sync from the log, mark the
    // sync record seen if the sync isn't going to be abandoned.
    replay_advance_from_sync (w, match_found, result == CONTINUE_EXECUTION);

    return result;
}

/* Unconditionally take ff: decrement its join counter and queue it as
   this worker's next frame. */
static void unconditional_steal(__cilkrts_worker *w,
                                full_frame *ff)
{
    // ASSERT: we hold ff->lock

    START_INTERVAL(w, INTERVAL_UNCONDITIONAL_STEAL) {
        decjoin(ff);
        __cilkrts_push_next_frame(w, ff);
    } STOP_INTERVAL(w, INTERVAL_UNCONDITIONAL_STEAL);
}


/* CHILD is about to die.  Give its exceptions to a sibling or to the
   parent. */
static inline void splice_exceptions_for_call(__cilkrts_worker *w,
                                              full_frame *parent_ff,
                                              full_frame *child_ff)
{
    // ASSERT: We own parent_ff->lock
    CILK_ASSERT(child_ff->is_call_child);
    CILK_ASSERT(NULL == child_ff->right_pending_exception);
    CILK_ASSERT(NULL == parent_ff->pending_exception);

    // A called child has no siblings to its right, so its pending
    // exception moves straight into the parent.
    parent_ff->pending_exception = child_ff->pending_exception;
    child_ff->pending_exception = NULL;
}

/**
 * Merge exceptions for a dying child.
 *
 * @param w The currently executing worker.
 * @param ff The child frame that is dying.
 * @param left_exception_ptr Pointer to the exception that is to our left.
 */
static inline
void splice_exceptions_for_spawn(__cilkrts_worker *w,
                                 full_frame *ff,
                                 struct pending_exception_info **left_exception_ptr)
{
    // ASSERT: parent_ff == child_ff->parent.
    // ASSERT: We own parent_ff->lock

    // Merge current exception into the slot where the left
    // exception should go.
    *left_exception_ptr =
        __cilkrts_merge_pending_exceptions(w,
                                           *left_exception_ptr,
                                           ff->pending_exception);
    ff->pending_exception = NULL;


    // Merge right exception into the slot where the left exception
    // should go.
    *left_exception_ptr =
        __cilkrts_merge_pending_exceptions(w,
                                           *left_exception_ptr,
                                           ff->right_pending_exception);
    ff->right_pending_exception = NULL;
}


static inline void splice_stacks_for_call(__cilkrts_worker *w,
                                          full_frame *parent_ff,
                                          full_frame *child_ff)
{
#if CILK_LIB_DEBUG
    if (parent_ff->call_stack)
        CILK_ASSERT(!(parent_ff->call_stack->flags & CILK_FRAME_MBZ));
#endif

    /* A synched frame does not have accumulated child reducers. */
    CILK_ASSERT(!child_ff->fiber_child);
    CILK_ASSERT(child_ff->is_call_child);

    /* An attached parent has no self fiber.  It may have
       accumulated child fibers or child owners, which should be
       ignored until sync. */
    CILK_ASSERT(!parent_ff->fiber_self);
    parent_ff->fiber_self = child_ff->fiber_self;
    child_ff->fiber_self = NULL;
}

static void finalize_child_for_call(__cilkrts_worker *w,
                                    full_frame *parent_ff,
                                    full_frame *child_ff)
{
    // ASSERT: we hold w->lock and parent_ff->lock

    START_INTERVAL(w, INTERVAL_FINALIZE_CHILD) {
        CILK_ASSERT(child_ff->is_call_child);
        CILK_ASSERT(child_ff->join_counter == 0);
        CILK_ASSERT(!child_ff->rightmost_child);
        CILK_ASSERT(child_ff == parent_ff->rightmost_child);

        // CHILD is about to die.
        // Splicing out reducers is a no-op for a call since
        // w->reducer_map should already store the correct
        // reducer map.

        // ASSERT there are no maps left to reduce.
        CILK_ASSERT(NULL == child_ff->children_reducer_map);
        CILK_ASSERT(NULL == child_ff->right_reducer_map);

        splice_exceptions_for_call(w, parent_ff, child_ff);

        splice_stacks_for_call(w, parent_ff, child_ff);

        /* remove CHILD from list of children of PARENT */
        unlink_child(parent_ff, child_ff);

        /* continue with the parent. */
        unconditional_steal(w, parent_ff);
        __cilkrts_destroy_full_frame(w, child_ff);
    } STOP_INTERVAL(w, INTERVAL_FINALIZE_CHILD);
}


/**
 * The invariant on ff->children_reducer_map is that when ff is
 * synched and when we are about to resume execution of ff, at least
 * one of ff->children_reducer_map and w->reducer_map must be NULL.
 *
 * Consider the two possibilities before resuming execution of ff:
 *
 * 1. Suppose ff is synched and suspended.  Then either
 *
 *     (a) ff->children_reducer_map stores the reducer map that w
 *         should use, where w is the worker resuming execution of ff,
 *         OR
 *     (b) w already has a user map, and ff->children_reducer_map is NULL.
 *
 *     Case (a) happens when we are resuming execution of ff as a
 *     provably good steal.  In this case, w->reducer_map should be
 *     NULL and ff->children_reducer_map is valid.  To resume
 *     execution of ff on w, set w->reducer_map to
 *     ff->children_reducer_map.
 *
 *     Case (b) occurs when we resume execution of ff because ff is a
 *     called child.  Then, ff->children_reducer_map should be NULL,
 *     and w should already have a valid reducer map when resuming
 *     execution of ff.  We resume execution of ff without changing
 *     w->reducer_map.
 *
 * 2. Suppose frame ff is not synched (i.e., it is active and might have
 *    active children).   Then ff->children_reducer_map is the slot for
 *    storing the reducer map from ff's leftmost child, as in the reducer
 *    protocol.   The runtime may resume execution of ff while it is not
 *    synched only because of a steal.
 *    In this case, while we are resuming ff, ff->children_reducer_map
 *    may be non-NULL (because one of ff's children has completed).
 *    We resume execution of ff without changing w->reducer_map.
1307 */ 1308static void setup_for_execution_reducers(__cilkrts_worker *w, 1309 full_frame *ff) 1310{ 1311 // We only need to move ff->children_reducer_map into 1312 // w->reducer_map in case 1(a). 1313 // 1314 // First check whether ff is synched. 1315 __cilkrts_stack_frame *sf = ff->call_stack; 1316 if (!(sf->flags & CILK_FRAME_UNSYNCHED)) { 1317 // In this case, ff is synched. (Case 1). 1318 CILK_ASSERT(!ff->rightmost_child); 1319 1320 // Test whether we are in case 1(a) and have 1321 // something to do. Note that if both 1322 // ff->children_reducer_map and w->reducer_map are NULL, we 1323 // can't distinguish between cases 1(a) and 1(b) here. 1324 if (ff->children_reducer_map) { 1325 // We are in Case 1(a). 1326 CILK_ASSERT(!w->reducer_map); 1327 w->reducer_map = ff->children_reducer_map; 1328 ff->children_reducer_map = NULL; 1329 } 1330 } 1331} 1332 1333static void setup_for_execution_exceptions(__cilkrts_worker *w, 1334 full_frame *ff) 1335{ 1336 CILK_ASSERT(NULL == w->l->pending_exception); 1337 w->l->pending_exception = ff->pending_exception; 1338 ff->pending_exception = NULL; 1339} 1340 1341#if 0 /* unused */ 1342static void setup_for_execution_stack(__cilkrts_worker *w, 1343 full_frame *ff) 1344{ 1345} 1346#endif 1347 1348/* 1349 * setup_for_execution_pedigree 1350 * 1351 * Copies the pedigree information from the frame we're resuming to the 1352 * worker. Increments the pedigree if this is work that has been stolen 1353 * to match the increment on a return from a spawn helper. 
1354 */ 1355static void setup_for_execution_pedigree(__cilkrts_worker *w) 1356{ 1357 int pedigree_unsynched; 1358 __cilkrts_stack_frame *sf = w->current_stack_frame; 1359 1360 CILK_ASSERT(NULL != sf); 1361 1362 // If this isn't an ABI 1 or later frame, there's no pedigree information 1363 if (0 == CILK_FRAME_VERSION_VALUE(sf->flags)) 1364 return; 1365 1366 // Note whether the pedigree is unsynched and clear the flag before 1367 // we forget 1368 pedigree_unsynched = sf->flags & CILK_FRAME_SF_PEDIGREE_UNSYNCHED; 1369 sf->flags &= ~CILK_FRAME_SF_PEDIGREE_UNSYNCHED; 1370 1371 // If we're just marshalling onto this worker, do not increment 1372 // the rank since that wouldn't happen in a sequential execution 1373 if (w->l->work_stolen || pedigree_unsynched) 1374 { 1375 if (w->l->work_stolen) 1376 w->pedigree.rank = sf->parent_pedigree.rank + 1; 1377 else 1378 w->pedigree.rank = sf->parent_pedigree.rank; 1379 } 1380 1381 w->pedigree.parent = sf->parent_pedigree.parent; 1382 w->l->work_stolen = 0; 1383} 1384 1385static void setup_for_execution(__cilkrts_worker *w, 1386 full_frame *ff, 1387 int is_return_from_call) 1388{ 1389 // ASSERT: We own w->lock and ff->lock || P == 1 1390 1391 setup_for_execution_reducers(w, ff); 1392 setup_for_execution_exceptions(w, ff); 1393 /*setup_for_execution_stack(w, ff);*/ 1394 1395 ff->call_stack->worker = w; 1396 w->current_stack_frame = ff->call_stack; 1397 1398 // If this is a return from a call, leave the pedigree alone 1399 if (! is_return_from_call) 1400 setup_for_execution_pedigree(w); 1401 1402 __cilkrts_setup_for_execution_sysdep(w, ff); 1403 1404 w->head = w->tail = w->l->ltq; 1405 reset_THE_exception(w); 1406 1407 make_runnable(w, ff); 1408} 1409 1410 1411/* 1412 * Called by the scheduling fiber, right before 1413 * resuming a sf/ff for user code. 1414 * 1415 * This method associates the specified sf with the worker. 1416 * 1417 * It also asserts that w, ff, and sf all have the expected properties 1418 * for resuming user code. 
 */
void scheduling_fiber_prepare_to_resume_user_code(__cilkrts_worker *w,
                                                  full_frame *ff,
                                                  __cilkrts_stack_frame *sf)
{
    // Associate sf with this worker in both directions.
    w->current_stack_frame = sf;
    sf->worker = w;

    // Lots of debugging checks on the state of the fiber we might be
    // resuming.
#if FIBER_DEBUG >= 1
#   if FIBER_DEBUG >= 3
    {
        fprintf(stderr, "w=%d: ff=%p, sf=%p. about to resume user code\n",
                w->self, ff, sf);
    }
#   endif

    const int flags = sf->flags;
    CILK_ASSERT(flags & CILK_FRAME_SUSPENDED);
    CILK_ASSERT(!sf->call_parent);
    CILK_ASSERT(w->head == w->tail);

    /* A frame can not be resumed unless it was suspended. */
    CILK_ASSERT(ff->sync_sp != NULL);

    /* The leftmost frame has no allocated stack */
    if (ff->simulated_stolen)
        CILK_ASSERT(flags & CILK_FRAME_UNSYNCHED);
    else if (flags & CILK_FRAME_UNSYNCHED)
        /* XXX By coincidence sync_sp could be null. */
        CILK_ASSERT(ff->fiber_self != NULL);
    else
        /* XXX This frame could be resumed unsynched on the leftmost stack */
        CILK_ASSERT((ff->sync_master == 0 || ff->sync_master == w));
    CILK_ASSERT(w->l->frame_ff == ff);
#endif
}


/**
 * This method is the first method that should execute after we've
 * switched to a scheduling fiber from user code.
 *
 * @param fiber The scheduling fiber for the current worker; the worker
 *              itself is recovered via cilk_fiber_get_owner(fiber).
 */
static void enter_runtime_transition_proc(cilk_fiber *fiber)
{
    // We can execute this method for one of three reasons:
    // 1. Undo-detach finds parent stolen.
    // 2. Sync suspends frame.
    // 3. Return from Cilk entry point.
    //
    //
    // In cases 1 and 2, the frame may be truly suspended or
    // may be immediately executed by this worker after provably_good_steal.
    //
    //
    // There is a fourth case, which can, but does not need to execute
    // this function:
    //   4. Starting up the scheduling loop on a user or
    //      system worker.  In this case, we won't have
    //      a scheduling stack function to run.
    __cilkrts_worker* w = cilk_fiber_get_owner(fiber);
    if (w->l->post_suspend) {
        // Run the continuation function passed to longjmp_into_runtime
        run_scheduling_stack_fcn(w);

        // After we have jumped into the runtime and run the
        // scheduling function, any reducer map the worker had before entering the runtime
        // should have already been saved into the appropriate full
        // frame.
        CILK_ASSERT(NULL == w->reducer_map);

        // There shouldn't be any uncaught exceptions.
        //
        // In Windows, the OS catches any exceptions not caught by the
        // user code.  Thus, we are omitting the check on Windows.
        //
        // On Android, calling std::uncaught_exception with the stlport
        // library causes a seg fault.  Since we're not supporting
        // exceptions there at this point, just don't do the check
        //
        // TBD: Is this check also safe to do on Windows?
        CILKBUG_ASSERT_NO_UNCAUGHT_EXCEPTION();
    }
}


/**
 * Method called to jump back to executing user code.
 *
 * A normal return from the runtime back to resuming user code calls
 * this method.  A computation executed using force_reduce also calls
 * this method to return to user code.
 *
 * This function should not contain any code that depends on a fiber.
 * In a force-reduce case, the user worker may not have a fiber.  In
 * the force-reduce case, we call this method directly instead of
 * calling @c user_code_resume_after_switch_into_runtime.
 */
static inline NORETURN
cilkrts_resume(__cilkrts_stack_frame *sf, full_frame *ff)
{
    // Save the sync stack pointer, and do the bookkeeping
    char* sync_sp = ff->sync_sp;
    __cilkrts_take_stack(ff, sync_sp);  // leaves ff->sync_sp null

    sf->flags &= ~CILK_FRAME_SUSPENDED;
    // Actually longjmp to the user code.
    // We may have exceptions to deal with, since we are resuming
    // a previously-suspended frame.
    sysdep_longjmp_to_sf(sync_sp, sf, ff);
}


/**
 * Called by the user-code fiber right before resuming a full frame
 * (sf/ff).
 *
 * This method pulls sf/ff out of the worker, and then calls
 * cilkrts_resume to jump to user code.
 */
static NORETURN
user_code_resume_after_switch_into_runtime(cilk_fiber *fiber)
{
    __cilkrts_worker *w = cilk_fiber_get_owner(fiber);
    __cilkrts_stack_frame *sf;
    full_frame *ff;
    sf = w->current_stack_frame;
    ff = sf->worker->l->frame_ff;

#if FIBER_DEBUG >= 1
    CILK_ASSERT(ff->fiber_self == fiber);
    cilk_fiber_data *fdata = cilk_fiber_get_data(fiber);
    DBGPRINTF ("%d-%p: resume_after_switch_into_runtime, fiber=%p\n",
               w->self, w, fiber);
    CILK_ASSERT(sf == fdata->resume_sf);
#endif

    // Notify the Intel tools that we're stealing code
    ITT_SYNC_ACQUIRED(sf->worker);
    NOTIFY_ZC_INTRINSIC("cilk_continue", sf);
    cilk_fiber_invoke_tbb_stack_op(fiber, CILK_TBB_STACK_ADOPT);

    // Actually jump to user code.
    cilkrts_resume(sf, ff);
}


/* The current stack is about to either be suspended or destroyed.  This
 * function will switch to the stack on which the scheduler is suspended and
 * resume running the scheduler within function do_work().  Upon waking up,
 * the scheduler will run the 'cont' function, using the supplied worker and
 * frame.
 */
static NORETURN
longjmp_into_runtime(__cilkrts_worker *w,
                     scheduling_stack_fcn_t fcn,
                     __cilkrts_stack_frame *sf)
{
    full_frame *ff, *ff2;

    CILK_ASSERT(!w->l->post_suspend);
    ff = w->l->frame_ff;

    // If we've got only one worker, stealing shouldn't be possible.
    // Assume that this is a steal or return from spawn in a force-reduce case.
    // We don't have a scheduling stack to switch to, so call the continuation
    // function directly.
    if (1 == w->g->P) {
        fcn(w, ff, sf);

        /* The call to function c() will have pushed ff as the next frame.  If
         * this were a normal (non-forced-reduce) execution, there would have
         * been a pop_next_frame call in a separate part of the runtime.  We
         * must call pop_next_frame here to complete the push/pop cycle. */
        ff2 = pop_next_frame(w);

        setup_for_execution(w, ff2, 0);
        scheduling_fiber_prepare_to_resume_user_code(w, ff2, w->current_stack_frame);
        cilkrts_resume(w->current_stack_frame, ff2);

// Suppress clang warning that the expression result is unused
#if defined(__clang__) && (! defined(__INTEL_COMPILER))
#   pragma clang diagnostic push
#   pragma clang diagnostic ignored "-Wunused-value"
#endif // __clang__
        /* no return */
        CILK_ASSERT(((void)"returned from __cilkrts_resume", 0));
#if defined(__clang__) && (! defined(__INTEL_COMPILER))
#   pragma clang diagnostic pop
#endif // __clang__
    }

    w->l->post_suspend = fcn;
    w->l->suspended_stack = sf;

    ITT_SYNC_RELEASING(w);
    ITT_SYNC_PREPARE(w);

#if FIBER_DEBUG >= 2
    fprintf(stderr, "ThreadId=%p, W=%d: about to switch into runtime... w->l->frame_ff = %p, sf=%p\n",
            cilkos_get_current_thread_id(),
            w->self, w->l->frame_ff,
            sf);
#endif

    // Current fiber is either the (1) one we are about to free,
    // or (2) it has been passed up to the parent.
    cilk_fiber *current_fiber = ( w->l->fiber_to_free ?
                                  w->l->fiber_to_free :
                                  w->l->frame_ff->parent->fiber_child );
    cilk_fiber_data* fdata = cilk_fiber_get_data(current_fiber);
    CILK_ASSERT(NULL == w->l->frame_ff->fiber_self);

    // Clear the sf in the current fiber for cleanliness, to prevent
    // us from accidentally resuming a bad sf.
    // Technically, resume_sf gets overwritten for a fiber when
    // we are about to resume it anyway.
    fdata->resume_sf = NULL;
    CILK_ASSERT(fdata->owner == w);

    // Set the function to execute immediately after switching to the
    // scheduling fiber, but before freeing any fibers.
    cilk_fiber_set_post_switch_proc(w->l->scheduling_fiber,
                                    enter_runtime_transition_proc);
    cilk_fiber_invoke_tbb_stack_op(current_fiber, CILK_TBB_STACK_ORPHAN);

    if (w->l->fiber_to_free) {
        // Case 1: we are freeing this fiber.  We never
        // resume this fiber again after jumping into the runtime.
        w->l->fiber_to_free = NULL;

        // Extra check. Normally, the fiber we are about to switch to
        // should have a NULL owner.
        CILK_ASSERT(NULL == cilk_fiber_get_data(w->l->scheduling_fiber)->owner);
#if FIBER_DEBUG >= 4
        fprintf(stderr, "ThreadId=%p, W=%d: about to switch into runtime.. current_fiber = %p, deallcoate, switch to fiber %p\n",
                cilkos_get_current_thread_id(),
                w->self,
                current_fiber, w->l->scheduling_fiber);
#endif
        cilk_fiber_invoke_tbb_stack_op(current_fiber, CILK_TBB_STACK_RELEASE);
        NOTE_INTERVAL(w, INTERVAL_DEALLOCATE_RESUME_OTHER);
        cilk_fiber_remove_reference_from_self_and_resume_other(current_fiber,
                                                               &w->l->fiber_pool,
                                                               w->l->scheduling_fiber);
        // We should never come back here!
        CILK_ASSERT(0);
    }
    else {
        // Case 2: We are passing the fiber to our parent because we
        // are leftmost.  We should come back later to
        // resume execution of user code.
        //
        // If we are not freeing a fiber, then we must be
        // returning from a spawn or processing an exception.  The
        // "sync" path always frees a fiber.
        //
        // We must be the leftmost child, and by left holder logic, we
        // have already moved the current fiber into our parent full
        // frame.
#if FIBER_DEBUG >= 2
        fprintf(stderr, "ThreadId=%p, W=%d: about to suspend self into runtime.. current_fiber = %p, deallcoate, switch to fiber %p\n",
                cilkos_get_current_thread_id(),
                w->self,
                current_fiber, w->l->scheduling_fiber);
#endif

        NOTE_INTERVAL(w, INTERVAL_SUSPEND_RESUME_OTHER);

        cilk_fiber_suspend_self_and_resume_other(current_fiber,
                                                 w->l->scheduling_fiber);
        // Resuming this fiber returns control back to
        // this function because our implementation uses OS fibers.
        //
        // On Unix, we could have the choice of passing the
        // user_code_resume_after_switch_into_runtime as an extra "resume_proc"
        // that resumes execution of user code instead of the
        // jumping back here, and then jumping back to user code.
#if FIBER_DEBUG >= 2
        CILK_ASSERT(fdata->owner == __cilkrts_get_tls_worker());
#endif
        user_code_resume_after_switch_into_runtime(current_fiber);
    }
}

/*
 * Send a message to the children of the specified worker: run or wait.
 */
static void notify_children(__cilkrts_worker *w, unsigned int msg)
{
    int child_num;
    __cilkrts_worker *child;
    int num_sys_workers = w->g->P - 1;

    // If worker is "n", then its children are 2n + 1, and 2n + 2.
    child_num = (w->self << 1) + 1;
    if (child_num < num_sys_workers) {
        child = w->g->workers[child_num];
        CILK_ASSERT(child->l->signal_node);
        signal_node_msg(child->l->signal_node, msg);
        child_num++;
        if (child_num < num_sys_workers) {
            child = w->g->workers[child_num];
            CILK_ASSERT(child->l->signal_node);
            signal_node_msg(child->l->signal_node, msg);
        }
    }
}

/*
 * Notify this worker's children that they need to wait.
 */
static void notify_children_wait(__cilkrts_worker *w)
{
    notify_children(w, 0);
}

/*
 * Notify this worker's children to run and start trying to steal.
 */
static void notify_children_run(__cilkrts_worker *w)
{
    notify_children(w, 1);
}

/**
 * A single "check" to find work, either on our queue or through a
 * steal attempt.  This method checks our local queue once, and
 * performs one steal attempt.
 */
static full_frame* check_for_work(__cilkrts_worker *w)
{
    full_frame *ff = NULL;
    ff = pop_next_frame(w);
    // If there is no work on the queue, try to steal some.
    if (NULL == ff) {
        START_INTERVAL(w, INTERVAL_STEALING) {
            if (w->l->type != WORKER_USER && w->l->team != NULL) {
                // At this point, the worker knows for certain that it has run
                // out of work.  Therefore, it loses its team affiliation.  User
                // workers never change teams, of course.
                __cilkrts_worker_lock(w);
                w->l->team = NULL;
                __cilkrts_worker_unlock(w);
            }

            // If we are about to do a random steal, we should have no
            // full frame...
            CILK_ASSERT(NULL == w->l->frame_ff);
            random_steal(w);
        } STOP_INTERVAL(w, INTERVAL_STEALING);

        // If the steal was successful, then the worker has populated its next
        // frame with the work to resume.
        ff = pop_next_frame(w);
        if (NULL == ff) {
            // Punish the worker for failing to steal.
            // No quantum for you!
            __cilkrts_yield();
            w->l->steal_failure_count++;
        } else {
            // Reset steal_failure_count since there is obviously still work to
            // be done.
            w->l->steal_failure_count = 0;
        }
    }
    return ff;
}

/**
 * Keep stealing or looking on our queue.
 *
 * Returns either when a full frame is found, or NULL if the
 * computation is done.
 */
static full_frame* search_until_work_found_or_done(__cilkrts_worker *w)
{
    full_frame *ff = NULL;
    // Find a full frame to execute (either through random stealing,
    // or because we pull it off w's 1-element queue).
    while (!ff) {
        // Check worker state to figure out our next action.
        switch (worker_runnable(w))
        {
        case SCHEDULE_RUN:             // One attempt at checking for work.
            ff = check_for_work(w);
            break;
        case SCHEDULE_WAIT:            // go into wait-mode.
            CILK_ASSERT(WORKER_SYSTEM == w->l->type);
            // If we are about to wait, then we better not have
            // a frame that we should execute...
            CILK_ASSERT(NULL == w->l->next_frame_ff);
            notify_children_wait(w);
            signal_node_wait(w->l->signal_node);
            // ...
            // Runtime is waking up.
            notify_children_run(w);
            w->l->steal_failure_count = 0;
            break;
        case SCHEDULE_EXIT:            // exit the scheduler.
            CILK_ASSERT(WORKER_USER != w->l->type);
            return NULL;
        default:
            CILK_ASSERT(0);
            abort();
        }
    }
    return ff;
}

/**
 * The proc method for a scheduling fiber on a user worker.
 *
 * When a user worker jumps into the runtime, it jumps into this
 * method by either starting it if the scheduling fiber has never run
 * before, or resuming the fiber if it was previously suspended.
 */
COMMON_PORTABLE
void scheduler_fiber_proc_for_user_worker(cilk_fiber *fiber)
{
    __cilkrts_worker* w = cilk_fiber_get_owner(fiber);
    CILK_ASSERT(w);

    // This must be a user worker
    CILK_ASSERT(WORKER_USER == w->l->type);

    // If we aren't the current worker, then something is very wrong
    // here..
    verify_current_wkr(w);

    __cilkrts_run_scheduler_with_exceptions(w);
}


/**
 * The body of the runtime scheduling loop.  This function executes in
 * 4 stages:
 *
 * 1. Transitions from the user code into the runtime by
 *    executing any scheduling-stack functions.
 * 2. Looks for a full frame enqueued from a successful provably
 *    good steal.
 * 3. If no full frame is found in step 2, steal until
 *    a frame is found or we are done.  If we are done, finish
 *    the scheduling loop.
 * 4. When a frame is found, setup to resume user code.
 *    In particular, suspend the current fiber and resume the
 *    user fiber to execute the frame.
 *
 * Returns a fiber object that we should switch to after completing
 * the body of the loop, or NULL if we should continue executing on
 * this fiber.
 *
 * @pre @c current_fiber should equal @c wptr->l->scheduling_fiber
 *
 * @param current_fiber   The currently executing (scheduling) fiber
 * @param wptr            The currently executing worker.
 * @return                The next fiber we should switch to.
 */
static cilk_fiber* worker_scheduling_loop_body(cilk_fiber* current_fiber,
                                               void* wptr)
{
    __cilkrts_worker *w = (__cilkrts_worker*) wptr;
    CILK_ASSERT(current_fiber == w->l->scheduling_fiber);

    // Stage 1: Transition from executing user code to the runtime code.
    // We don't need to do this call here any more, because
    // every switch to the scheduling fiber should make this call
    // using a post_switch_proc on the fiber.
    //
    //  enter_runtime_transition_proc(w->l->scheduling_fiber, wptr);

    // After Stage 1 is complete, w should no longer have
    // an associated full frame.
    CILK_ASSERT(NULL == w->l->frame_ff);

    // Stage 2.  First do a quick check of our 1-element queue.
    full_frame *ff = pop_next_frame(w);

    if (!ff) {
        // Stage 3.  We didn't find anything from our 1-element
        // queue.  Now go through the steal loop to find work.
        ff = search_until_work_found_or_done(w);
        if (!ff) {
            CILK_ASSERT(w->g->work_done);
            return NULL;
        }
    }

    // Stage 4.  Now that we have found a full frame to work on,
    // actually execute it.
    __cilkrts_stack_frame *sf;

    // There shouldn't be any uncaught exceptions.
    //
    // In Windows, the OS catches any exceptions not caught by the
    // user code.  Thus, we are omitting the check on Windows.
    //
    // On Android, calling std::uncaught_exception with the stlport
    // library causes a seg fault.  Since we're not supporting
    // exceptions there at this point, just don't do the check
    CILKBUG_ASSERT_NO_UNCAUGHT_EXCEPTION();

    BEGIN_WITH_WORKER_LOCK(w) {
        CILK_ASSERT(!w->l->frame_ff);
        BEGIN_WITH_FRAME_LOCK(w, ff) {
            sf = ff->call_stack;
            CILK_ASSERT(sf && !sf->call_parent);
            setup_for_execution(w, ff, 0);
        } END_WITH_FRAME_LOCK(w, ff);
    } END_WITH_WORKER_LOCK(w);

    /* run it */
    //
    // Prepare to run the full frame.  To do so, we need to:
    //   (a) Execute some code on this fiber (the scheduling
    //       fiber) to set up data structures, and
    //   (b) Suspend the scheduling fiber, and resume the
    //       user-code fiber.

    // Part (a). Set up data structures.
    scheduling_fiber_prepare_to_resume_user_code(w, ff, sf);

    cilk_fiber *other = w->l->frame_ff->fiber_self;
    cilk_fiber_data* other_data = cilk_fiber_get_data(other);
    cilk_fiber_data* current_fiber_data = cilk_fiber_get_data(current_fiber);

    // I believe two cases are possible here, both of which
    // should have other_data->resume_sf as NULL.
    //
    // 1. Resuming a fiber that was previously executing
    //    user code (i.e., a provably-good-steal).
    //    In this case, resume_sf should have been
    //    set to NULL when it was suspended.
    //
    // 2. Resuming code on a steal.  In this case, since we
    //    grabbed a new fiber, resume_sf should be NULL.
    CILK_ASSERT(NULL == other_data->resume_sf);

#if FIBER_DEBUG >= 2
    fprintf(stderr, "W=%d: other fiber=%p, setting resume_sf to %p\n",
            w->self, other, other_data->resume_sf);
#endif
    // Update our own fiber's data.
    current_fiber_data->resume_sf = NULL;
    // The scheduling fiber should have the right owner from before.
    CILK_ASSERT(current_fiber_data->owner == w);
    other_data->resume_sf = sf;


#if FIBER_DEBUG >= 3
    fprintf(stderr, "ThreadId=%p (about to suspend self resume other), W=%d: current_fiber=%p, other=%p, current_fiber->resume_sf = %p, other->resume_sf = %p\n",
            cilkos_get_current_thread_id(),
            w->self,
            current_fiber, other,
            current_fiber_data->resume_sf,
            other_data->resume_sf);
#endif
    return other;
}


/**
 * This function is executed once by each worker, to initialize its
 * scheduling loop.
 */
static void worker_scheduler_init_function(__cilkrts_worker *w)
{
    // First, execute the startup tasks that must happen for all
    // worker types.
    ITT_SYNC_PREPARE(w);
    /* Notify tools about the new worker.  Inspector needs this, but we
       don't want to confuse Cilkscreen with system threads.  User threads
       do this notification in bind_thread */
    if (! w->g->under_ptool)
        __cilkrts_cilkscreen_establish_worker(w);

    // Seed the initial random number generator.
    // If we forget to do this, then the worker always steals from 0.
    // Programs will still execute correctly, but
    // you may see a subtle performance bug...
    mysrand(w, (w->self + 1));

    // The startup work varies, depending on the worker type.
    switch (w->l->type) {
    case WORKER_USER:
        // Stop working once we've entered the scheduler.
        // For user workers, INTERVAL_IN_SCHEDULER counts the time
        // since we called bind_thread.
        break;

    case WORKER_SYSTEM:
        // If a system worker is starting, we must also be starting
        // the runtime.

        // Runtime begins in a wait-state and is woken up by the first user
        // worker when the runtime is ready.
        signal_node_wait(w->l->signal_node);
        // ...
        // Runtime is waking up.
        notify_children_run(w);
        w->l->steal_failure_count = 0;

        // For system threads, count all the time this thread is
        // alive in the scheduling loop.
        START_INTERVAL(w, INTERVAL_IN_SCHEDULER);
        START_INTERVAL(w, INTERVAL_WORKING);
        break;
    default:
        __cilkrts_bug("Unknown worker %p of type %d entering scheduling loop\n",
                      w, w->l->type);
    }
}

/**
 * This function is executed once by each worker, to finish its
 * scheduling loop.
 *
 * @note Currently, only system workers finish their loops.  User
 * workers will jump away to user code without exiting their
 * scheduling loop.
 */
static void worker_scheduler_terminate_function(__cilkrts_worker *w)
{
    // A user worker should never finish by falling through the
    // scheduling loop.
2049 CILK_ASSERT(WORKER_USER != w->l->type); 2050 STOP_INTERVAL(w, INTERVAL_IN_RUNTIME); 2051 STOP_INTERVAL(w, INTERVAL_IN_SCHEDULER); 2052} 2053 2054/** 2055 * The main scheduler function executed by a worker's scheduling 2056 * fiber. 2057 * 2058 * This method is started by either a new system worker, or a user 2059 * worker that has stalled and just been imported into the runtime. 2060 */ 2061static void worker_scheduler_function(__cilkrts_worker *w) 2062{ 2063 worker_scheduler_init_function(w); 2064 2065 // The main scheduling loop body. 2066 2067 while (!w->g->work_done) { 2068 // Set intervals. Now we are in the runtime instead of working. 2069 START_INTERVAL(w, INTERVAL_IN_RUNTIME); 2070 STOP_INTERVAL(w, INTERVAL_WORKING); 2071 2072 // Execute the "body" of the scheduling loop, and figure 2073 // out the fiber to jump to next. 2074 cilk_fiber* fiber_to_resume 2075 = worker_scheduling_loop_body(w->l->scheduling_fiber, w); 2076 2077 if (fiber_to_resume) { 2078 // Suspend the current fiber and resume next one. 2079 NOTE_INTERVAL(w, INTERVAL_SUSPEND_RESUME_OTHER); 2080 STOP_INTERVAL(w, INTERVAL_IN_RUNTIME); 2081 START_INTERVAL(w, INTERVAL_WORKING); 2082 cilk_fiber_suspend_self_and_resume_other(w->l->scheduling_fiber, 2083 fiber_to_resume); 2084 2085 // Return here only when this (scheduling) fiber is 2086 // resumed (i.e., this worker wants to reenter the runtime). 2087 } 2088 } 2089 2090 // Finish the scheduling loop. 2091 worker_scheduler_terminate_function(w); 2092} 2093 2094 2095/************************************************************* 2096 Forward declarations for reduction protocol. 
*************************************************************/

static __cilkrts_worker*
execute_reductions_for_sync(__cilkrts_worker *w,
                            full_frame *ff,
                            __cilkrts_stack_frame *sf_at_sync);

static __cilkrts_worker*
execute_reductions_for_spawn_return(__cilkrts_worker *w,
                                    full_frame *ff,
                                    __cilkrts_stack_frame *returning_sf);



/*************************************************************
  Scheduler functions that are callable by client code
*************************************************************/

/**
 * Detach the full frame @c ff from worker @c w, making it unrunnable,
 * and return its parent.  @c why is a human-readable reason string
 * passed through to make_unrunnable for debugging.
 */
static full_frame *disown(__cilkrts_worker *w,
                          full_frame *ff,
                          __cilkrts_stack_frame *sf,
                          const char *why)
{
    CILK_ASSERT(ff);
    make_unrunnable(w, ff, sf, sf != 0, why);
    w->l->frame_ff = 0;
    return ff->parent;
}

/**
 * Called when ff is returning from a spawn, and we need to execute a
 * reduction.
 *
 * @param w             The currently executing worker.
 * @param ff            The full frame for w.
 * @param returning_sf  The stack frame for the spawn helper that is returning.
 *
 * Normally, by the time we gain control in the runtime, the worker
 * has already popped off the __cilkrts_stack_frame "returning_sf"
 * from its call chain.
 *
 * When we have only serial reductions, w->current_stack_frame is not
 * needed any more, because w is about to enter the runtime scheduling
 * loop anyway.  Similarly, the frame "ff" is slated to be destroyed
 * after the runtime finishes the return from spawn and splices ff out
 * of the tree of full frames.
 *
 * To execute a parallel reduction, however, we still want
 * w->current_stack_frame == returning_sf, and we are going to use the
 * frame ff for a little bit longer.
 *
 * This method:
 *
 *    1. Puts returning_sf back as w's current stack frame.
 *    2. Makes "ff" runnable again on w.
 */
static inline
void restore_frame_for_spawn_return_reduction(__cilkrts_worker *w,
                                              full_frame *ff,
                                              __cilkrts_stack_frame *returning_sf) {
#if REDPAR_DEBUG >= 2
    CILK_ASSERT(returning_sf);
    CILK_ASSERT(returning_sf->worker == w);
#endif
    // Change w's current stack frame back to "returning_sf".
    //
    // Intuitively, w->current_stack_frame should be
    // returning_sf->call_parent at this point.
    //
    // We can not assert this, however, because the pop of
    // returning_sf from the call chain has already cleared
    // returning_sf->call_parent.  We don't want to restore the call
    // parent of returning_sf, because its parent has been stolen, and
    // the runtime assumes that steals break this link.

    // We cannot assert call_parent is NULL either, since that's not true for
    // Win64 exception handling
//    CILK_ASSERT(returning_sf->call_parent == NULL);
    w->current_stack_frame = returning_sf;

    // Make the full frame "ff" runnable again, in preparation for
    // executing the reduction.
    make_runnable(w, ff);
}


/**
 * Client entry point for a stalled sync: moves any pending exception
 * into the full frame, runs the reduction protocol, and longjmps into
 * the runtime to execute do_sync.  Never returns to the caller.
 */
NORETURN __cilkrts_c_sync(__cilkrts_worker *w,
                          __cilkrts_stack_frame *sf_at_sync)
{
    full_frame *ff;

    // Claim: This read of w->l->frame_ff can occur without
    // holding the worker lock because when w has reached a sync
    // and entered the runtime (because it stalls), w's deque is empty
    // and no one else can steal and change w->l->frame_ff.

    ff = w->l->frame_ff;
#ifdef _WIN32
    __cilkrts_save_exception_state(w, ff);
#else
    // Move any pending exceptions into the full frame
    CILK_ASSERT(NULL == ff->pending_exception);
    ff->pending_exception = w->l->pending_exception;
    w->l->pending_exception = NULL;
#endif

    // The reduction protocol may migrate us to a different worker.
    w = execute_reductions_for_sync(w, ff, sf_at_sync);

#if FIBER_DEBUG >= 3
    fprintf(stderr, "ThreadId=%p, w->self = %d. about to longjmp_into_runtim[c_sync] with ff=%p\n",
            cilkos_get_current_thread_id(), w->self, ff);
#endif

    longjmp_into_runtime(w, do_sync, sf_at_sync);
}

/**
 * Runtime-side continuation of a stalled sync: attempts a provably
 * good steal of ff and, if execution cannot continue here, abandons
 * the frame so the scheduling loop can look for other work.
 */
static void do_sync(__cilkrts_worker *w, full_frame *ff,
                    __cilkrts_stack_frame *sf)
{
    //int abandoned = 1;
    enum provably_good_steal_t steal_result = ABANDON_EXECUTION;

    START_INTERVAL(w, INTERVAL_SYNC_CHECK) {
        BEGIN_WITH_WORKER_LOCK_OPTIONAL(w) {

            CILK_ASSERT(ff);
            BEGIN_WITH_FRAME_LOCK(w, ff) {
                CILK_ASSERT(sf->call_parent == 0);
                CILK_ASSERT(sf->flags & CILK_FRAME_UNSYNCHED);

                // Before switching into the scheduling fiber, we should have
                // already taken care of deallocating the current
                // fiber.
                CILK_ASSERT(NULL == ff->fiber_self);

                // Update the frame's pedigree information if this is an ABI 1
                // or later frame
                if (CILK_FRAME_VERSION_VALUE(sf->flags) >= 1)
                {
                    sf->parent_pedigree.rank = w->pedigree.rank;
                    sf->parent_pedigree.parent = w->pedigree.parent;

                    // Note that the pedigree rank needs to be updated
                    // when setup_for_execution_pedigree runs
                    sf->flags |= CILK_FRAME_SF_PEDIGREE_UNSYNCHED;
                }

                /* the decjoin() occurs in provably_good_steal() */
                steal_result = provably_good_steal(w, ff);

            } END_WITH_FRAME_LOCK(w, ff);
            // set w->l->frame_ff = NULL after checking abandoned
            if (WAIT_FOR_CONTINUE != steal_result) {
                w->l->frame_ff = NULL;
            }
        } END_WITH_WORKER_LOCK_OPTIONAL(w);
    } STOP_INTERVAL(w, INTERVAL_SYNC_CHECK);

    // Now, if we are in a replay situation and provably_good_steal() returned
    // WAIT_FOR_CONTINUE, we should sleep, reacquire locks, call
    // provably_good_steal(), and release locks until we get a value other
    // than WAIT_FOR_CONTINUE from the function.
#ifdef CILK_RECORD_REPLAY
    // We don't have to explicitly check for REPLAY_LOG below because
    // steal_result can only be set to WAIT_FOR_CONTINUE during replay
    while(WAIT_FOR_CONTINUE == steal_result)
    {
        __cilkrts_sleep();
        BEGIN_WITH_WORKER_LOCK_OPTIONAL(w)
        {
            ff = w->l->frame_ff;
            BEGIN_WITH_FRAME_LOCK(w, ff)
            {
                steal_result = provably_good_steal(w, ff);
            } END_WITH_FRAME_LOCK(w, ff);
            if (WAIT_FOR_CONTINUE != steal_result)
                w->l->frame_ff = NULL;
        } END_WITH_WORKER_LOCK_OPTIONAL(w);
    }
#endif  // CILK_RECORD_REPLAY

#ifdef ENABLE_NOTIFY_ZC_INTRINSIC
    // If we can't make any further progress on this thread, tell Inspector
    // that we're abandoning the work and will go find something else to do.
    if (ABANDON_EXECUTION == steal_result)
    {
        NOTIFY_ZC_INTRINSIC("cilk_sync_abandon", 0);
    }
#endif // defined ENABLE_NOTIFY_ZC_INTRINSIC

    return; /* back to scheduler loop */
}

/* worker W completely promotes its own deque, simulating the case
   where the whole deque is stolen.  We use this mechanism to force
   the allocation of new storage for reducers for race-detection
   purposes. */
void __cilkrts_promote_own_deque(__cilkrts_worker *w)
{
    // Remember the fiber we start this method on.
    CILK_ASSERT(w->l->frame_ff);
    cilk_fiber* starting_fiber = w->l->frame_ff->fiber_self;

    BEGIN_WITH_WORKER_LOCK(w) {
        while (dekker_protocol(w)) {
            /* PLACEHOLDER_FIBER is used as non-null marker to tell detach()
               and make_child() that this frame should be treated as a spawn
               parent, even though we have not assigned it a stack. */
            detach_for_steal(w, w, PLACEHOLDER_FIBER);
        }
    } END_WITH_WORKER_LOCK(w);


    // TBD: The management of full frames and fibers is a bit
    // sketchy here.  We are promoting stack frames into full frames,
    // and pretending they are stolen away, but no other worker is
    // actually working on them.  Some runtime invariants
    // may be broken here.
    //
    // Technically, if we are simulating a steal from w
    // w should get a new full frame, but
    // keep the same fiber.  A real thief would be taking the
    // loot frame away, get a new fiber, and starting executing the
    // loot frame.
    //
    // What should a fake thief do?  Where does the frame go?

    // In any case, we should be finishing the promotion process on
    // the same fiber with which we started.
    CILK_ASSERT(w->l->frame_ff);
    CILK_ASSERT(w->l->frame_ff->fiber_self == starting_fiber);
}



/* the client code calls this function after a spawn when the dekker
   protocol fails.  The function may either return or longjmp
   into the rts

   This function takes in a "returning_sf" argument which corresponds
   to the __cilkrts_stack_frame that we are finishing (i.e., the
   argument to __cilkrts_leave_frame).
   */
void __cilkrts_c_THE_exception_check(__cilkrts_worker *w,
                                     __cilkrts_stack_frame *returning_sf)
{
    full_frame *ff;
    int stolen_p;
    __cilkrts_stack_frame *saved_sf = NULL;

    START_INTERVAL(w, INTERVAL_THE_EXCEPTION_CHECK);

    BEGIN_WITH_WORKER_LOCK(w) {
        ff = w->l->frame_ff;
        CILK_ASSERT(ff);
        /* This code is called only upon a normal return and never
           upon an exceptional return.  Assert that this is the
           case. */
        CILK_ASSERT(!w->l->pending_exception);

        reset_THE_exception(w);
        stolen_p = !(w->head < (w->tail + 1)); /* +1 because tail was
                                                  speculatively
                                                  decremented by the
                                                  compiled code */

        if (stolen_p) {
            /* XXX This will be charged to THE for accounting purposes */
            __cilkrts_save_exception_state(w, ff);

            // Save the value of the current stack frame.
            saved_sf = w->current_stack_frame;

            // Reverse the decrement from undo_detach.
            // This update effectively resets the deque to be
            // empty (i.e., changes w->tail back to equal w->head).
            // We need to reset the deque to execute parallel
            // reductions.  When we have only serial reductions, it
            // does not matter, since serial reductions do not
            // change the deque.
            w->tail++;
#if REDPAR_DEBUG > 1
            // ASSERT our deque is empty.
            CILK_ASSERT(w->head == w->tail);
#endif
        }
    } END_WITH_WORKER_LOCK(w);

    STOP_INTERVAL(w, INTERVAL_THE_EXCEPTION_CHECK);

    if (stolen_p)
    {
        w = execute_reductions_for_spawn_return(w, ff, returning_sf);

        // "Mr. Policeman?  My parent always told me that if I was in trouble
        // I should ask a nice policeman for help.  I can't find my parent
        // anywhere..."
        //
        // Write a record to the replay log for an attempt to return to a stolen parent
        replay_record_orphaned(w);

        // Update the pedigree only after we've finished the
        // reductions.
        update_pedigree_on_leave_frame(w, returning_sf);

        // Notify Inspector that the parent has been stolen and we're
        // going to abandon this work and go do something else.  This
        // will match the cilk_leave_begin in the compiled code
        NOTIFY_ZC_INTRINSIC("cilk_leave_stolen", saved_sf);

        DBGPRINTF ("%d: longjmp_into_runtime from __cilkrts_c_THE_exception_check\n", w->self);
        longjmp_into_runtime(w, do_return_from_spawn, 0);
        DBGPRINTF ("%d: returned from longjmp_into_runtime from __cilkrts_c_THE_exception_check?!\n", w->self);
    }
    else
    {
        NOTE_INTERVAL(w, INTERVAL_THE_EXCEPTION_CHECK_USELESS);
        return;
    }
}

/* Return an exception to a stolen parent.
 */
NORETURN __cilkrts_exception_from_spawn(__cilkrts_worker *w,
                                        __cilkrts_stack_frame *returning_sf)
{
    full_frame *ff = w->l->frame_ff;
    // This is almost the same as THE_exception_check, except
    // the detach didn't happen, we don't need to undo the tail
    // update.
    CILK_ASSERT(w->head == w->tail);
    w = execute_reductions_for_spawn_return(w, ff, returning_sf);

    longjmp_into_runtime(w, do_return_from_spawn, 0);
    CILK_ASSERT(0);
}

/**
 * Runtime-side continuation of a spawn return: decrements the join
 * counter of the returning child, performs a provably good steal (or
 * an unconditional steal for simulated-stolen parents), and destroys
 * the child's full frame.  @c sf is always NULL on this path.
 */
static void do_return_from_spawn(__cilkrts_worker *w,
                                 full_frame *ff,
                                 __cilkrts_stack_frame *sf)
{
    full_frame *parent_ff;
    enum provably_good_steal_t steal_result = ABANDON_EXECUTION;

    BEGIN_WITH_WORKER_LOCK_OPTIONAL(w) {
        CILK_ASSERT(ff);
        CILK_ASSERT(!ff->is_call_child);
        CILK_ASSERT(sf == NULL);
        parent_ff = ff->parent;

        BEGIN_WITH_FRAME_LOCK(w, ff) {
            decjoin(ff);
        } END_WITH_FRAME_LOCK(w, ff);

        BEGIN_WITH_FRAME_LOCK(w, parent_ff) {
            if (parent_ff->simulated_stolen)
                unconditional_steal(w, parent_ff);
            else
                steal_result = provably_good_steal(w, parent_ff);
        } END_WITH_FRAME_LOCK(w, parent_ff);

    } END_WITH_WORKER_LOCK_OPTIONAL(w);

    // Loop here in replay mode
#ifdef CILK_RECORD_REPLAY
    // We don't have to explicitly check for REPLAY_LOG below because
    // steal_result can only get set to WAIT_FOR_CONTINUE during replay.
    // We also don't have to worry about the simulated_stolen flag
    // because steal_result can only be set to WAIT_FOR_CONTINUE by
    // provably_good_steal().
    while(WAIT_FOR_CONTINUE == steal_result)
    {
        __cilkrts_sleep();
        BEGIN_WITH_WORKER_LOCK_OPTIONAL(w)
        {
            BEGIN_WITH_FRAME_LOCK(w, parent_ff)
            {
                steal_result = provably_good_steal(w, parent_ff);
            } END_WITH_FRAME_LOCK(w, parent_ff);
        } END_WITH_WORKER_LOCK_OPTIONAL(w);
    }
#endif  // CILK_RECORD_REPLAY

    // Cleanup the child frame.
    __cilkrts_destroy_full_frame(w, ff);
    return;
}

#ifdef _WIN32
/* migrate an exception across fibers.  Call this function when an exception has
 * been thrown and has to traverse across a steal.  The exception has already
 * been wrapped up, so all that remains is to longjmp() into the continuation,
 * sync, and re-raise it.
 */
void __cilkrts_migrate_exception(__cilkrts_stack_frame *sf) {

    __cilkrts_worker *w = sf->worker;
    full_frame *ff;

    BEGIN_WITH_WORKER_LOCK(w) {
        ff = w->l->frame_ff;
        reset_THE_exception(w);
        /* there is no need to check for a steal because we wouldn't be here if
           there weren't a steal. */
        __cilkrts_save_exception_state(w, ff);

        CILK_ASSERT(w->head == w->tail);
    } END_WITH_WORKER_LOCK(w);

    {
        // TBD(jsukha): This function emulates the
        // the "do_return_from_spawn" path.
        w = execute_reductions_for_spawn_return(w, ff, sf);
    }

    longjmp_into_runtime(w, do_return_from_spawn, 0); /* does not return. */
    CILK_ASSERT(! "Shouldn't be here...");
}
#endif


/* Pop a call stack from TAIL.
   Return the call stack, or NULL if the
   queue is empty */
__cilkrts_stack_frame *__cilkrts_pop_tail(__cilkrts_worker *w)
{
    __cilkrts_stack_frame *sf;
    BEGIN_WITH_WORKER_LOCK(w) {
        __cilkrts_stack_frame *volatile *tail = w->tail;
        if (w->head < tail) {
            // Non-empty deque: pop the youngest frame.
            --tail;
            sf = *tail;
            w->tail = tail;
        } else {
            sf = 0;
        }
    } END_WITH_WORKER_LOCK(w);
    return sf;
}

#ifdef CILK_RECORD_REPLAY
/* Like __cilkrts_pop_tail, but leaves the deque unchanged: returns
   the frame that WOULD be popped, or NULL if the queue is empty. */
__cilkrts_stack_frame *simulate_pop_tail(__cilkrts_worker *w)
{
    __cilkrts_stack_frame *sf;
    BEGIN_WITH_WORKER_LOCK(w) {
        if (w->head < w->tail) {
            sf = *(w->tail-1);
        } else {
            sf = 0;
        }
    } END_WITH_WORKER_LOCK(w);
    return sf;
}
#endif


/* Return from a call, not a spawn. */
void __cilkrts_return(__cilkrts_worker *w)
{
    full_frame *ff, *parent_ff;
    START_INTERVAL(w, INTERVAL_RETURNING);

    BEGIN_WITH_WORKER_LOCK_OPTIONAL(w) {
        ff = w->l->frame_ff;
        CILK_ASSERT(ff);
        CILK_ASSERT(ff->join_counter == 1);
        /* This path is not used to return from spawn. */
        CILK_ASSERT(ff->is_call_child);

        BEGIN_WITH_FRAME_LOCK(w, ff) {
            // After this call, w->l->frame_ff != ff.
            // Technically, w will "own" ff until ff is freed,
            // however, because ff is a dying leaf full frame.
            parent_ff = disown(w, ff, 0, "return");
            decjoin(ff);

#ifdef _WIN32
            __cilkrts_save_exception_state(w, ff);
#else
            // Move the pending exceptions into the full frame
            // This should always be NULL if this isn't a
            // return with an exception
            CILK_ASSERT(NULL == ff->pending_exception);
            ff->pending_exception = w->l->pending_exception;
            w->l->pending_exception = NULL;
#endif  // _WIN32

        } END_WITH_FRAME_LOCK(w, ff);

        __cilkrts_fence(); /* redundant */

        CILK_ASSERT(parent_ff);

        BEGIN_WITH_FRAME_LOCK(w, parent_ff) {
            finalize_child_for_call(w, parent_ff, ff);
        } END_WITH_FRAME_LOCK(w, parent_ff);

        ff = pop_next_frame(w);
        /* ff will be non-null except when the parent frame is owned
           by another worker.
           CILK_ASSERT(ff)
        */
        CILK_ASSERT(!w->l->frame_ff);
        if (ff) {
            BEGIN_WITH_FRAME_LOCK(w, ff) {
                __cilkrts_stack_frame *sf = ff->call_stack;
                CILK_ASSERT(sf && !sf->call_parent);
                setup_for_execution(w, ff, 1);
            } END_WITH_FRAME_LOCK(w, ff);
        }
    } END_WITH_WORKER_LOCK_OPTIONAL(w);

    STOP_INTERVAL(w, INTERVAL_RETURNING);
}

/**
 * Detach the calling thread from its worker: clears the TLS worker
 * pointer, destroys or recycles the worker, and disables Cilkscreen
 * instrumentation if this was the last user thread in Cilk.
 * Must NOT be called with the global OS mutex held.
 */
// NOTE(review): empty parameter list `()` means "unspecified parameters"
// in pre-C23 C; `(void)` would be the stricter prototype — confirm
// project convention before changing.
static void __cilkrts_unbind_thread()
{
    int stop_cilkscreen = 0;
    global_state_t *g;

    // Take out the global OS mutex to protect accesses to the table of workers
    global_os_mutex_lock();

    if (cilkg_is_published()) {
        __cilkrts_worker *w = __cilkrts_get_tls_worker();
        if (w) {
            g = w->g;

            // If there's only 1 worker, the counts will be stopped in
            // __cilkrts_scheduler
            if (g->P > 1)
            {
                STOP_INTERVAL(w, INTERVAL_WORKING);
                STOP_INTERVAL(w, INTERVAL_IN_SCHEDULER);
            }

            __cilkrts_set_tls_worker(0);

            if (w->self == -1) {
                // This worker is an overflow worker.  I.e., it was created on-
                // demand when the global pool ran out of workers.
                destroy_worker(w);
                __cilkrts_free(w);
            } else {
                // This is a normal user worker and needs to be counted by the
                // global state for the purposes of throttling system workers.
                w->l->type = WORKER_FREE;
                __cilkrts_leave_cilk(g);
            }

            stop_cilkscreen = (0 == g->Q);
        }
    }
    global_os_mutex_unlock();

    /* Turn off Cilkscreen.  This needs to be done when we are NOT holding the
     * os mutex. */
    if (stop_cilkscreen)
        __cilkrts_cilkscreen_disable_instrumentation();
}

/* special return from the initial frame */

void __cilkrts_c_return_from_initial(__cilkrts_worker *w)
{
    struct cilkred_map *rm;

    /* This is only called on a user thread worker. */
    CILK_ASSERT(w->l->type == WORKER_USER);

    #if REDPAR_DEBUG >= 3
    fprintf(stderr, "[W=%d, desc=cilkrts_c_return_from_initial, ff=%p]\n",
            w->self, w->l->frame_ff);
    #endif

    BEGIN_WITH_WORKER_LOCK_OPTIONAL(w) {
        full_frame *ff = w->l->frame_ff;
        CILK_ASSERT(ff);
        CILK_ASSERT(ff->join_counter == 1);
        w->l->frame_ff = 0;

        CILK_ASSERT(ff->fiber_self);
        // Save any TBB interop data for the next time this thread enters Cilk
        cilk_fiber_tbb_interop_save_info_from_stack(ff->fiber_self);

        // Deallocate cilk_fiber that mapped to the user stack.  The stack
        // itself does not get deallocated (of course) but our data
        // structure becomes divorced from it.

#if FIBER_DEBUG >= 1
        fprintf(stderr, "ThreadId=%p: w=%d: We are about to deallocate ff->fiber_self = %p here. w->l->scheduling_fiber = %p. w->l->type = %d\n",
                cilkos_get_current_thread_id(),
                w->self,
                ff->fiber_self,
                w->l->scheduling_fiber,
                w->l->type);
#endif
        // The fiber in ff is a user-code fiber.  The fiber in
        // w->l->scheduling_fiber is a scheduling fiber.  These fibers should
        // never be equal.  When a user worker returns (and will unbind), we
        // should destroy only the fiber in ff.  The scheduling fiber will be
        // re-used.

        CILK_ASSERT(ff->fiber_self != w->l->scheduling_fiber);

        START_INTERVAL(w, INTERVAL_FIBER_DEALLOCATE) {
            // This fiber might not be deallocated here if there
            // is a pending exception on Windows that refers
            // to this fiber.
            //
            // First "suspend" the fiber, and then try to delete it.
            cilk_fiber_deallocate_from_thread(ff->fiber_self);
        } STOP_INTERVAL(w, INTERVAL_FIBER_DEALLOCATE);
        ff->fiber_self = NULL;

        /* Save reducer map into global_state object */
        rm = w->reducer_map;
        w->reducer_map = NULL;

#if REDPAR_DEBUG >= 3
        fprintf(stderr, "W=%d, reducer_map_to_delete=%p, was in ff=%p\n",
                w->self,
                rm,
                ff);
#endif
        __cilkrts_destroy_full_frame(w, ff);


        /* Work is never done. w->g->work_done = 1; __cilkrts_fence(); */
    } END_WITH_WORKER_LOCK_OPTIONAL(w);


    save_pedigree_leaf_from_user_worker(w);

    // Workers can have NULL reducer maps now.
    if (rm) {
        __cilkrts_destroy_reducer_map(w, rm);
    }


#if FIBER_DEBUG >= 1
    __cilkrts_worker* tmp = w;
    int tmp_id = w->self;
    fprintf(stderr, "w=%d: We are about unbind thread (w= %p)\n",
            w->self,
            w);
#endif

    // Null out w so nothing below accidentally uses the worker after
    // the thread has been unbound.
    w = NULL;

    __cilkrts_unbind_thread();

#if FIBER_DEBUG >= 1

    fprintf(stderr, "w=%p, %d: Finished unbind\n",
            tmp, tmp_id);
#endif

    /* Other workers will stop trying to steal if this was the last worker. */

    return;
}


/*
 * __cilkrts_restore_stealing
 *
 * Restore the protected_tail to a previous state, possibly allowing frames
 * to be stolen.  The dekker_protocol has been extended to steal only if
 * head+1 is < protected_tail.
2764 */ 2765 2766void __cilkrts_restore_stealing( 2767 __cilkrts_worker *w, 2768 __cilkrts_stack_frame *volatile *saved_protected_tail) 2769{ 2770 /* On most x86 this pair of operations would be slightly faster 2771 as an atomic exchange due to the implicit memory barrier in 2772 an atomic instruction. */ 2773 w->protected_tail = saved_protected_tail; 2774 __cilkrts_fence(); 2775} 2776 2777/* 2778 * __cilkrts_disallow_stealing 2779 * 2780 * Move the protected_tail to NEW_PROTECTED_TAIL, preventing any 2781 * frames from being stolen. If NEW_PROTECTED_TAIL is NULL, prevent 2782 * stealing from the whole queue. The dekker_protocol has been 2783 * extended to only steal if head+1 is also < protected_tail. 2784 */ 2785 2786__cilkrts_stack_frame *volatile *__cilkrts_disallow_stealing( 2787 __cilkrts_worker *w, 2788 __cilkrts_stack_frame *volatile *new_protected_tail) 2789{ 2790 __cilkrts_stack_frame *volatile *saved_protected_tail = w->protected_tail; 2791 2792 if (!new_protected_tail) 2793 new_protected_tail = w->l->ltq; 2794 2795 if (w->protected_tail > new_protected_tail) { 2796 w->protected_tail = new_protected_tail; 2797 /* Issue a store-store barrier. The update to protected_tail 2798 here must precede the update to tail in the next spawn. 2799 On x86 this is probably not needed. 
*/ 2800#if defined __GNUC__ && __ICC >= 1200 && !(__MIC__ ||__MIC2__) 2801 _mm_sfence(); 2802#else 2803 __cilkrts_fence(); 2804#endif 2805 } 2806 2807 return saved_protected_tail; 2808} 2809 2810/************************************************************* 2811 Initialization and startup 2812*************************************************************/ 2813 2814__cilkrts_worker *make_worker(global_state_t *g, 2815 int self, __cilkrts_worker *w) 2816{ 2817 w->self = self; 2818 w->g = g; 2819 2820 w->pedigree.rank = 0; // Initial rank is 0 2821 w->pedigree.parent = NULL; 2822 2823 w->l = (local_state *)__cilkrts_malloc(sizeof(*w->l)); 2824 2825 __cilkrts_frame_malloc_per_worker_init(w); 2826 2827 w->reducer_map = NULL; 2828 w->current_stack_frame = NULL; 2829 w->reserved = NULL; 2830 2831 w->l->worker_magic_0 = WORKER_MAGIC_0; 2832 w->l->team = NULL; 2833 w->l->type = WORKER_FREE; 2834 2835 __cilkrts_mutex_init(&w->l->lock); 2836 __cilkrts_mutex_init(&w->l->steal_lock); 2837 w->l->do_not_steal = 0; 2838 w->l->frame_ff = 0; 2839 w->l->next_frame_ff = 0; 2840 w->l->last_full_frame = NULL; 2841 2842 w->l->ltq = (__cilkrts_stack_frame **) 2843 __cilkrts_malloc(g->ltqsize * sizeof(*w->l->ltq)); 2844 w->ltq_limit = w->l->ltq + g->ltqsize; 2845 w->head = w->tail = w->l->ltq; 2846 2847 cilk_fiber_pool_init(&w->l->fiber_pool, 2848 &g->fiber_pool, 2849 g->stack_size, 2850 g->fiber_pool_size, 2851 0, // alloc_max is 0. We don't allocate from the heap directly without checking the parent pool. 
2852 0); 2853#if FIBER_DEBUG >= 2 2854 fprintf(stderr, "ThreadId=%p: Making w=%d (%p), pool = %p\n", 2855 cilkos_get_current_thread_id(), 2856 w->self, w, 2857 &w->l->fiber_pool); 2858#endif 2859 w->l->scheduling_fiber = NULL; 2860 w->l->original_pedigree_leaf = NULL; 2861 w->l->rand_seed = 0; /* the scheduler will overwrite this field */ 2862 2863 w->l->post_suspend = 0; 2864 w->l->suspended_stack = 0; 2865 w->l->fiber_to_free = NULL; 2866 w->l->pending_exception = NULL; 2867 2868#if CILK_PROFILE 2869 w->l->stats = __cilkrts_malloc(sizeof(statistics)); 2870 __cilkrts_init_stats(w->l->stats); 2871#else 2872 w->l->stats = NULL; 2873#endif 2874 w->l->steal_failure_count = 0; 2875 2876 w->l->work_stolen = 0; 2877 2878 // Initialize record/replay assuming we're doing neither 2879 w->l->record_replay_fptr = NULL; 2880 w->l->replay_list_root = NULL; 2881 w->l->replay_list_entry = NULL; 2882 w->l->signal_node = NULL; 2883 // Nothing's been stolen yet 2884 w->l->worker_magic_1 = WORKER_MAGIC_1; 2885 2886 /*w->parallelism_disabled = 0;*/ 2887 2888 // Allow stealing all frames. Sets w->saved_protected_tail 2889 __cilkrts_restore_stealing(w, w->ltq_limit); 2890 2891 __cilkrts_init_worker_sysdep(w); 2892 2893 reset_THE_exception(w); 2894 2895 return w; 2896} 2897 2898void destroy_worker(__cilkrts_worker *w) 2899{ 2900 CILK_ASSERT (NULL == w->l->pending_exception); 2901 2902 // Deallocate the scheduling fiber 2903 if (NULL != w->l->scheduling_fiber) 2904 { 2905 // The scheduling fiber is the main fiber for system workers and must 2906 // be deallocated by the thread that created it. Thus, we can 2907 // deallocate only free workers' (formerly user workers) scheduling 2908 // fibers here. 
2909 CILK_ASSERT(WORKER_FREE == w->l->type); 2910 2911#if FIBER_DEBUG >=1 2912 fprintf(stderr, "ThreadId=%p, w=%p, %d, deallocating scheduling fiber = %p, \n", 2913 cilkos_get_current_thread_id(), 2914 w, 2915 w->self, 2916 w->l->scheduling_fiber); 2917#endif 2918 int ref_count = cilk_fiber_remove_reference(w->l->scheduling_fiber, NULL); 2919 // Scheduling fiber should never have extra references because of exceptions. 2920 CILK_ASSERT(0 == ref_count); 2921 w->l->scheduling_fiber = NULL; 2922 } 2923 2924#if CILK_PROFILE 2925 if (w->l->stats) { 2926 __cilkrts_free(w->l->stats); 2927 } 2928#else 2929 CILK_ASSERT(NULL == w->l->stats); 2930#endif 2931 2932 /* Free any cached fibers. */ 2933 cilk_fiber_pool_destroy(&w->l->fiber_pool); 2934 2935 __cilkrts_destroy_worker_sysdep(w); 2936 2937 if (w->l->signal_node) { 2938 CILK_ASSERT(WORKER_SYSTEM == w->l->type); 2939 signal_node_destroy(w->l->signal_node); 2940 } 2941 2942 __cilkrts_free(w->l->ltq); 2943 __cilkrts_mutex_destroy(0, &w->l->lock); 2944 __cilkrts_mutex_destroy(0, &w->l->steal_lock); 2945 __cilkrts_frame_malloc_per_worker_cleanup(w); 2946 2947 __cilkrts_free(w->l); 2948 2949 // The caller is responsible for freeing the worker memory 2950} 2951 2952/* 2953 * Make a worker into a system worker. 
 */
static void make_worker_system(__cilkrts_worker *w) {
    // Only a freshly-made (free) worker may be promoted.
    CILK_ASSERT(WORKER_FREE == w->l->type);
    w->l->type = WORKER_SYSTEM;
    // System workers get a signal node so the root can tell them to
    // wait/steal (see worker_runnable, wake_runtime, sleep_runtime).
    w->l->signal_node = signal_node_create();
}

/*
 * Tear down the global runtime state: destroy the dummy full frame on
 * worker 0, record/replay state, sysdep state, every worker struct, the
 * global fiber pool, and the frame allocator.  Inverse of
 * __cilkrts_init_internal()/init_workers().
 */
void __cilkrts_deinit_internal(global_state_t *g)
{
    int i;
    __cilkrts_worker *w;

    // If there's no global state then we're done
    if (NULL == g)
        return;

#ifdef CILK_PROFILE
    __cilkrts_dump_stats_to_stderr(g);
#endif

    w = g->workers[0];
    if (w->l->frame_ff) {
        __cilkrts_destroy_full_frame(w, w->l->frame_ff);
        w->l->frame_ff = 0;
    }

    // Release any resources used for record/replay
    replay_term(g);

    // Destroy any system dependent global state
    __cilkrts_destroy_global_sysdep(g);

    for (i = 0; i < g->total_workers; ++i)
        destroy_worker(g->workers[i]);

    // Free memory for all worker blocks which were allocated contiguously
    // (see init_workers: workers_memory is one __cilkrts_malloc block and
    // g->workers[0] points at its first element).
    __cilkrts_free(g->workers[0]);

    __cilkrts_free(g->workers);

    cilk_fiber_pool_destroy(&g->fiber_pool);
    __cilkrts_frame_malloc_global_cleanup(g);

    cilkg_deinit_global_state();
}

/*
 * Wake the runtime by notifying the system workers that they can steal. The
 * first user worker into the runtime should call this.
 */
static void wake_runtime(global_state_t *g)
{
    __cilkrts_worker *root;
    if (g->P > 1) {
        // Send a message to the root node.  The message will propagate.
        root = g->workers[0];
        CILK_ASSERT(root->l->signal_node);
        signal_node_msg(root->l->signal_node, 1);
    }
}

/*
 * Put the runtime to sleep.  The last user worker out of the runtime should
 * call this.  Like Dad always said, turn out the lights when nobody's in the
 * room.
 */
static void sleep_runtime(global_state_t *g)
{
    __cilkrts_worker *root;
    if (g->P > 1) {
        // Send a message to the root node.  The message will propagate.
        root = g->workers[0];
        CILK_ASSERT(root->l->signal_node);
        signal_node_msg(root->l->signal_node, 0);
    }
}

/* Called when a user thread joins Cilk.
   Global lock must be held. */
void __cilkrts_enter_cilk(global_state_t *g)
{
    // g->Q counts user threads currently inside Cilk; protected by the
    // global lock the caller holds.
    if (g->Q++ == 0) {
        // If this is the first user thread to enter Cilk wake
        // up all the workers.
        wake_runtime(g);
    }
}

/* Called when a user thread leaves Cilk.
   Global lock must be held. */
void __cilkrts_leave_cilk(global_state_t *g)
{
    if (--g->Q == 0) {
        // Put the runtime to sleep.
        sleep_runtime(g);
    }
}

/*
 * worker_runnable
 *
 * Return true if the worker should continue to try to steal.  False, otherwise.
 */

NOINLINE
static enum schedule_t worker_runnable(__cilkrts_worker *w)
{
    global_state_t *g = w->g;

    /* If this worker has something to do, do it.
       Otherwise the work would be lost. */
    if (w->l->next_frame_ff)
        return SCHEDULE_RUN;

    // If Cilk has explicitly (by the user) been told to exit (i.e., by
    // __cilkrts_end_cilk() -> __cilkrts_stop_workers(g)), then return 0.
    if (g->work_done)
        return SCHEDULE_EXIT;

    if (0 == w->self) {
        // This worker is the root node and is the only one that may query the
        // global state to see if there are still any user workers in Cilk.
        if (w->l->steal_failure_count > g->max_steal_failures) {
            if (signal_node_should_wait(w->l->signal_node)) {
                return SCHEDULE_WAIT;
            } else {
                // Reset the steal_failure_count since we have verified that
                // user workers are still in Cilk.
                w->l->steal_failure_count = 0;
            }
        }
    } else if (WORKER_SYSTEM == w->l->type &&
               signal_node_should_wait(w->l->signal_node)) {
        // This worker has been notified by its parent that it should stop
        // trying to steal.
        return SCHEDULE_WAIT;
    }

    return SCHEDULE_RUN;
}



// Initialize the worker structs, but don't start the workers themselves.
static void init_workers(global_state_t *g)
{
    int total_workers = g->total_workers;
    int i;
    // Pad each worker out to its own cache-line-aligned region so workers
    // don't false-share; the whole array is one allocation (see the
    // matching free of g->workers[0] in __cilkrts_deinit_internal).
    struct CILK_ALIGNAS(256) buffered_worker {
        __cilkrts_worker w;
        char buf[64];
    } *workers_memory;

    /* not needed if only one worker */
    cilk_fiber_pool_init(&g->fiber_pool,
                         NULL,
                         g->stack_size,
                         g->global_fiber_pool_size, // buffer_size
                         g->max_stacks,             // maximum # to allocate
                         1);

    cilk_fiber_pool_set_fiber_limit(&g->fiber_pool,
                                    (g->max_stacks ? g->max_stacks : INT_MAX));

    g->workers = (__cilkrts_worker **)
        __cilkrts_malloc(total_workers * sizeof(*g->workers));

    // Allocate 1 block of memory for workers to make life easier for tools
    // like Inspector which run multithreaded and need to know the memory
    // range for all the workers that will be accessed in a user's program
    workers_memory = (struct buffered_worker*)
        __cilkrts_malloc(sizeof(*workers_memory) * total_workers);

    // Notify any tools that care (Cilkscreen and Inspector) that they should
    // ignore memory allocated for the workers
    __cilkrts_cilkscreen_ignore_block(&workers_memory[0],
                                      &workers_memory[total_workers]);

    // Initialize worker structs, including unused worker slots.
    for (i = 0; i < total_workers; ++i) {
        g->workers[i] = make_worker(g, i, &workers_memory[i].w);
    }

    // Set the workers in the first P - 1 slots to be system workers.
    // Remaining worker structs already have type == 0.
    for (i = 0; i < g->system_workers; ++i) {
        make_worker_system(g->workers[i]);
    }
}

/*
 * Initialize (and optionally publish and start) the global runtime state.
 * Uses the global OS mutex in a check/lock/re-check pattern so that only
 * one thread initializes and publishes.  If 'start' is nonzero, also
 * launches the P - 1 system worker threads (once).
 */
void __cilkrts_init_internal(int start)
{
    global_state_t *g = NULL;

    if (cilkg_is_published()) {
        g = cilkg_init_global_state();
    }
    else {

        // We think the state has not been published yet.
        // Grab the lock and try to initialize/publish.
        global_os_mutex_lock();

        if (cilkg_is_published()) {
            // Some other thread must have snuck in and published.
            g = cilkg_init_global_state();
        }
        else {
            // Initialize and retrieve global state
            g = cilkg_init_global_state();

            // Set the scheduler pointer
            g->scheduler = worker_scheduler_function;

            // If we're running under a sequential P-Tool (Cilkscreen or
            // Cilkview) then there's only one worker and we need to tell
            // the tool about the extent of the stack
            if (g->under_ptool)
                __cilkrts_establish_c_stack();
            init_workers(g);

            // Initialize per-work record/replay logging
            replay_init_workers(g);

            // Initialize any system dependent global state
            __cilkrts_init_global_sysdep(g);


            cilkg_publish_global_state(g);
        }

        global_os_mutex_unlock();
    }

    CILK_ASSERT(g);

    if (start && !g->workers_running)
    {
        // Acquire the global OS mutex while we're starting the workers
        global_os_mutex_lock();
        if (!g->workers_running)
            // Start P - 1 system workers since P includes the first user
            // worker.
            __cilkrts_start_workers(g, g->P - 1);
        global_os_mutex_unlock();
    }
}


/************************************************************************
  Methods for reducer protocol.

  Reductions occur in two places:
    A. A full frame "ff" is returning from a spawn with a stolen parent.
    B. A full frame "ff" is stalling at a sync.

  To support parallel reductions, reduction functions need to be
  executed while control is on a user stack, before jumping into the
  runtime.  These reductions can not occur while holding a worker or
  frame lock.

  Before a worker w executes a reduction in either Case A or B, w's
  deque is empty.

  Since parallel reductions push work onto the deque, we must do extra
  work to set up runtime data structures properly before reductions
  begin to allow stealing.  ( Normally, when we have only serial
  reductions, once a worker w starts a reduction, its deque remains
  empty until w either steals another frame or resumes a suspended
  frame.  Thus, we don't care about the state of the deque, since w
  will reset its deque when setting up execution of a frame. )

  To allow for parallel reductions, we coerce the runtime data
  structures so that, from their perspective, it looks as though we
  have spliced in an "execute_reductions()" function.  Consider the
  two cases for reductions:

  Case A: Return from a spawn with a stolen parent.
    Consider a spawned function g is returning on a worker w.
    Assume:
      -   g was spawned from a parent function f.
      -   ff is the full frame for g's spawn helper
      -   sf be the __cilkrts_stack_frame for g's spawn helper.

    We are conceptually splicing "execute_reductions()" so that it
    occurs immediately before the spawn helper of g returns to f.

    We do so by creating two different world views --- one for the
    runtime data structures, and one for the actual control flow.

      - Before reductions begin, the runtime data structures should
        look as though the spawn helper of g is calling
        "execute_reductions()", in terms of both the user stack and
        worker deque.  More precisely, w should satisfy the
        following properties:

          (a) w has ff as its full frame,
          (b) w has sf as its __cilkrts_stack_frame, and
          (c) w has an empty deque.

        If the runtime satisfies these properties, then if w
        encounters a spawn in a parallel reduction, it can push onto
        a valid deque.  Also, when a steal from w occurs, it will
        build the correct tree of full frames when w is stolen from.

      - In actual control flow, however, once the
        "execute_reductions()" function returns, it is actually
        returning to runtime code instead of g's spawn helper.

        At the point a worker w began executing reductions, the
        control flow / compiled code had already finished g's spawn
        helper, and w was about to enter the runtime.  With parallel
        reductions, some worker v (which might be different from w)
        is the one returning to the runtime.


    The reduction logic consists of 4 steps:

      A1. Restore runtime data structures to make it look as though
          the spawn helper of g() is still the currently executing
          frame for w.

      A2. Execute reductions on the user stack.  Reductions also
          includes the logic for exceptions and stacks.  Note that
          reductions start on w, but may finish on a different
          worker if there is parallelism in the reduce.

      A3. Splice out ff from the tree of full frames.

      A4. Jump into the runtime/scheduling stack and execute
          "do_return_from_spawn".  This method

          (a) Frees the user stack we were just on if it is no longer needed.
          (b) Decrement the join counter on ff->parent, and tries to do a
              provably good steal.
          (c) Clean up the full frame ff.


  Case B: Stalling at a sync.

    Consider a function g(), with full frame ff and
    __cilkrts_stack_frame sf.  Suppose g() stalls at a sync, and we
    are executing reductions.

    Conceptually, we are splicing in an "execute_reductions()"
    function into g() as the last action that g() takes immediately
    before it executes the cilk_sync.

    The reduction logic for this case is similar to Case A.

      B1. Restore the runtime data structures.

          The main difference from Case A is that ff/sf is still a
          frame that needs to be executed later (since it is stalling
          at a cilk_sync).  Thus, we also need to save the current
          stack information into "ff" so that we can correctly resume
          execution of "ff" after the sync.

      B2. Execute reductions on the user stack.

      B3. No frame to splice out of the tree.

      B4. Jump into the runtime/scheduling stack and execute "do_sync".
          This method:
          (a) Frees the user stack we were just on if it is no longer needed.
          (b) Tries to execute a provably good steal.

  Finally, for the reducer protocol, we consider two reduction paths,
  namely a "fast" and "slow" path.  On a fast path, only trivial
  merges of reducer maps happen (i.e., one or both of the maps are
  NULL).  Otherwise, on the slow path, a reduction actually needs to
  happen.

*****************************************************************/

/**
 * @brief Locations to store the result of a reduction.
 *
 * Struct storing pointers to the fields in our "left" sibling that we
 * should update when splicing out a full frame or stalling at a sync.
 */
typedef struct {
    /** A pointer to the location of our left reducer map. */
    struct cilkred_map **map_ptr;

    /** A pointer to the location of our left exception. */
    struct pending_exception_info **exception_ptr;
} splice_left_ptrs;

/**
 * For a full frame returning from a spawn, calculate the pointers to
 * the maps and exceptions to my left.
 *
 * @param w The currently executing worker.
3348 * @param ff Full frame that is dying 3349 * @return Pointers to our "left" for reducers and exceptions. 3350 */ 3351static inline 3352splice_left_ptrs compute_left_ptrs_for_spawn_return(__cilkrts_worker *w, 3353 full_frame *ff) 3354{ 3355 // ASSERT: we hold the lock on ff->parent 3356 3357 splice_left_ptrs left_ptrs; 3358 if (ff->left_sibling) { 3359 left_ptrs.map_ptr = &ff->left_sibling->right_reducer_map; 3360 left_ptrs.exception_ptr = &ff->left_sibling->right_pending_exception; 3361 } 3362 else { 3363 full_frame *parent_ff = ff->parent; 3364 left_ptrs.map_ptr = &parent_ff->children_reducer_map; 3365 left_ptrs.exception_ptr = &parent_ff->child_pending_exception; 3366 } 3367 return left_ptrs; 3368} 3369 3370/** 3371 * For a full frame at a sync, calculate the pointers to the maps and 3372 * exceptions to my left. 3373 * 3374 * @param w The currently executing worker. 3375 * @param ff Full frame that is stalling at a sync. 3376 * @return Pointers to our "left" for reducers and exceptions. 3377 */ 3378static inline 3379splice_left_ptrs compute_left_ptrs_for_sync(__cilkrts_worker *w, 3380 full_frame *ff) 3381{ 3382 // ASSERT: we hold the lock on ff 3383 splice_left_ptrs left_ptrs; 3384 3385 // Figure out which map to the left we should merge into. 3386 if (ff->rightmost_child) { 3387 CILK_ASSERT(ff->rightmost_child->parent == ff); 3388 left_ptrs.map_ptr = &(ff->rightmost_child->right_reducer_map); 3389 left_ptrs.exception_ptr = &(ff->rightmost_child->right_pending_exception); 3390 } 3391 else { 3392 // We have no children. Then, we should be the last 3393 // worker at the sync... "left" is our child map. 3394 left_ptrs.map_ptr = &(ff->children_reducer_map); 3395 left_ptrs.exception_ptr = &(ff->child_pending_exception); 3396 } 3397 return left_ptrs; 3398} 3399 3400/** 3401 * After we have completed all reductions on a spawn return, call this 3402 * method to finish up before jumping into the runtime. 3403 * 3404 * 1. 
Perform the "reduction" on stacks, i.e., execute the left 3405 * holder logic to pass the leftmost stack up. 3406 * 3407 * w->l->fiber_to_free holds any stack that needs to be freed 3408 * when control switches into the runtime fiber. 3409 * 3410 * 2. Unlink and remove child_ff from the tree of full frames. 3411 * 3412 * @param w The currently executing worker. 3413 * @param parent_ff The parent of child_ff. 3414 * @param child_ff The full frame returning from a spawn. 3415 */ 3416static inline 3417void finish_spawn_return_on_user_stack(__cilkrts_worker *w, 3418 full_frame *parent_ff, 3419 full_frame *child_ff) 3420{ 3421 CILK_ASSERT(w->l->fiber_to_free == NULL); 3422 3423 // Execute left-holder logic for stacks. 3424 if (child_ff->left_sibling || parent_ff->fiber_child) { 3425 // Case where we are not the leftmost stack. 3426 CILK_ASSERT(parent_ff->fiber_child != child_ff->fiber_self); 3427 3428 // Remember any fiber we need to free in the worker. 3429 // After we jump into the runtime, we will actually do the 3430 // free. 3431 w->l->fiber_to_free = child_ff->fiber_self; 3432 } 3433 else { 3434 // We are leftmost, pass stack/fiber up to parent. 3435 // Thus, no stack/fiber to free. 3436 parent_ff->fiber_child = child_ff->fiber_self; 3437 w->l->fiber_to_free = NULL; 3438 } 3439 3440 child_ff->fiber_self = NULL; 3441 3442 unlink_child(parent_ff, child_ff); 3443} 3444 3445 3446/** 3447 * Executes any fast reductions necessary to splice ff out of the tree 3448 * of full frames. 3449 * 3450 * This "fast" path performs only trivial merges of reducer maps, 3451 * i.e,. when one of them is NULL. 3452 * (See slow_path_reductions_for_spawn_return() for slow path.) 3453 * 3454 * Returns: 1 if we finished all reductions. 3455 * Returns: 0 if there are still reductions to execute, and 3456 * we should execute the slow path. 3457 * 3458 * This method assumes w holds the frame lock on parent_ff. 3459 * After this method completes: 3460 * 1. 
We have spliced ff out of the tree of full frames. 3461 * 2. The reducer maps of child_ff have been deposited 3462 * "left" according to the reducer protocol. 3463 * 3. w->l->stack_to_free stores the stack 3464 * that needs to be freed once we jump into the runtime. 3465 * 3466 * We have not, however, decremented the join counter on ff->parent. 3467 * This prevents any other workers from resuming execution of the parent. 3468 * 3469 * @param w The currently executing worker. 3470 * @param ff The full frame returning from a spawn. 3471 * @return NULL if we finished all reductions. 3472 * @return The address where the left map is stored (which should be passed to 3473 * slow_path_reductions_for_spawn_return()) if there are 3474 * still reductions to execute. 3475 */ 3476struct cilkred_map** 3477fast_path_reductions_for_spawn_return(__cilkrts_worker *w, 3478 full_frame *ff) 3479{ 3480 // ASSERT: we hold ff->parent->lock. 3481 splice_left_ptrs left_ptrs; 3482 3483 CILK_ASSERT(NULL == w->l->pending_exception); 3484 3485 // Figure out the pointers to the left where I want 3486 // to put reducers and exceptions. 3487 left_ptrs = compute_left_ptrs_for_spawn_return(w, ff); 3488 3489 // Go ahead and merge exceptions while holding the lock. 3490 splice_exceptions_for_spawn(w, ff, left_ptrs.exception_ptr); 3491 3492 // Now check if we have any reductions to perform. 3493 // 3494 // Consider all the cases of left, middle and right maps. 3495 // 0. (-, -, -) : finish and return 1 3496 // 1. (L, -, -) : finish and return 1 3497 // 2. (-, M, -) : slide over to left, finish, and return 1. 3498 // 3. (L, M, -) : return 0 3499 // 4. (-, -, R) : slide over to left, finish, and return 1. 3500 // 5. (L, -, R) : return 0 3501 // 6. (-, M, R) : return 0 3502 // 7. (L, M, R) : return 0 3503 // 3504 // In terms of code: 3505 // L == *left_ptrs.map_ptr 3506 // M == w->reducer_map 3507 // R == f->right_reducer_map. 
3508 // 3509 // The goal of the code below is to execute the fast path with 3510 // as few branches and writes as possible. 3511 3512 int case_value = (*(left_ptrs.map_ptr) != NULL); 3513 case_value += ((w->reducer_map != NULL) << 1); 3514 case_value += ((ff->right_reducer_map != NULL) << 2); 3515 3516 // Fastest path is case_value == 0 or 1. 3517 if (case_value >=2) { 3518 switch (case_value) { 3519 case 2: 3520 *(left_ptrs.map_ptr) = w->reducer_map; 3521 w->reducer_map = NULL; 3522 return NULL; 3523 break; 3524 case 4: 3525 *(left_ptrs.map_ptr) = ff->right_reducer_map; 3526 ff->right_reducer_map = NULL; 3527 return NULL; 3528 default: 3529 // If we have to execute the slow path, then 3530 // return the pointer to the place to deposit the left 3531 // map. 3532 return left_ptrs.map_ptr; 3533 } 3534 } 3535 3536 // Do nothing 3537 return NULL; 3538} 3539 3540 3541/** 3542 * Executes any reductions necessary to splice "ff" frame out of 3543 * the steal tree. 3544 * 3545 * This method executes the "slow" path for reductions on a spawn 3546 * return, i.e., there are non-NULL maps that need to be merged 3547 * together. 3548 * 3549 * This method should execute only if 3550 * fast_path_reductions_for_spawn_return() returns a non-NULL 3551 * left_map_ptr. 3552 * 3553 * Upon entry, left_map_ptr should be the location of the left map 3554 * at the start of the reduction, as calculated by 3555 * fast_path_reductions_for_spawn_return(). 3556 * 3557 * After this method completes: 3558 * 1. We have spliced ff out of the tree of full frames. 3559 * 2. The reducer maps of child_ff have been deposited 3560 * "left" according to the reducer protocol. 3561 * 3. w->l->stack_to_free stores the stack 3562 * that needs to be freed once we jump into the runtime. 3563 * We have not, however, decremented the join counter on ff->parent, 3564 * so no one can resume execution of the parent yet. 
 *
 * WARNING:
 * This method assumes the lock on ff->parent is held upon entry, and
 * Upon exit, the worker that returns still holds a lock on ff->parent
 * This method can, however, release and reacquire the lock on ff->parent.
 *
 * @param w The currently executing worker.
 * @param ff The full frame returning from a spawn.
 * @param left_map_ptr Pointer to our initial left map.
 * @return The worker that this method returns on.
 */
static __cilkrts_worker*
slow_path_reductions_for_spawn_return(__cilkrts_worker *w,
                                      full_frame *ff,
                                      struct cilkred_map **left_map_ptr)
{

    // CILK_ASSERT: w is holding frame lock on parent_ff.
#if REDPAR_DEBUG > 0
    CILK_ASSERT(!ff->rightmost_child);
    CILK_ASSERT(!ff->is_call_child);
#endif

    // Loop invariant:
    // When beginning this loop, we should
    //   1. Be holding the lock on ff->parent.
    //   2. left_map_ptr should be the address of the pointer to the left map.
    //   3. All maps should be slid over left by one, if possible.
    //   4. All exceptions should be merged so far.
    while (1) {

        // Slide middle map left if possible.
        if (!(*left_map_ptr)) {
            *left_map_ptr = w->reducer_map;
            w->reducer_map = NULL;
        }
        // Slide right map to middle if possible.
        if (!w->reducer_map) {
            w->reducer_map = ff->right_reducer_map;
            ff->right_reducer_map = NULL;
        }

        // Since we slid everything left by one,
        // we are finished if there is no middle map.
        if (!w->reducer_map) {
            verify_current_wkr(w);
            return w;
        }
        else {
            struct cilkred_map* left_map;
            struct cilkred_map* middle_map;
            struct cilkred_map* right_map;

            // Take all the maps from their respective locations.
            // We can't leave them in place and execute a reduction because these fields
            // might change once we release the lock.
            left_map = *left_map_ptr;
            *left_map_ptr = NULL;
            middle_map = w->reducer_map;
            w->reducer_map = NULL;
            right_map = ff->right_reducer_map;
            ff->right_reducer_map = NULL;

            // WARNING!!! Lock release here.
            // We have reductions to execute (and we can't hold locks).
            __cilkrts_frame_unlock(w, ff->parent);

            // Merge all reducers into the left map.
            // NOTE: repeated_merge_reducer_maps takes &w because the
            // reduction may contain parallelism and finish on a different
            // worker; w is updated in place.
            left_map = repeated_merge_reducer_maps(&w,
                                                   left_map,
                                                   middle_map);
            verify_current_wkr(w);
            left_map = repeated_merge_reducer_maps(&w,
                                                   left_map,
                                                   right_map);
            verify_current_wkr(w);
            CILK_ASSERT(NULL == w->reducer_map);
            // Put the final answer back into w->reducer_map.
            w->reducer_map = left_map;

            // Save any exceptions generated because of the reduction
            // process from the returning worker.  These get merged
            // the next time around the loop.
            CILK_ASSERT(NULL == ff->pending_exception);
            ff->pending_exception = w->l->pending_exception;
            w->l->pending_exception = NULL;

            // Lock ff->parent for the next loop around.
            __cilkrts_frame_lock(w, ff->parent);

            // Once we have the lock again, recompute who is to our
            // left.
            splice_left_ptrs left_ptrs;
            left_ptrs = compute_left_ptrs_for_spawn_return(w, ff);

            // Update the pointer for the left map.
            left_map_ptr = left_ptrs.map_ptr;
            // Splice the exceptions for spawn.
            splice_exceptions_for_spawn(w, ff, left_ptrs.exception_ptr);
        }
    }
    // We should never break out of this loop.

    CILK_ASSERT(0);
    return NULL;
}



/**
 * Execute reductions when returning from a spawn whose parent has
 * been stolen.
 *
 * Execution may start on w, but may finish on a different worker.
 * This method acquires/releases the lock on ff->parent.
 *
 * @param w The currently executing worker.
3682 * @param ff The full frame of the spawned function that is returning. 3683 * @param returning_sf The __cilkrts_stack_frame for this returning function. 3684 * @return The worker returning from this method. 3685 */ 3686static __cilkrts_worker* 3687execute_reductions_for_spawn_return(__cilkrts_worker *w, 3688 full_frame *ff, 3689 __cilkrts_stack_frame *returning_sf) 3690{ 3691 // Step A1 from reducer protocol described above. 3692 // 3693 // Coerce the runtime into thinking that 3694 // ff/returning_sf are still on the bottom of 3695 // w's deque. 3696 restore_frame_for_spawn_return_reduction(w, ff, returning_sf); 3697 3698 // Step A2 and A3: Execute reductions on user stack. 3699 BEGIN_WITH_FRAME_LOCK(w, ff->parent) { 3700 struct cilkred_map **left_map_ptr; 3701 left_map_ptr = fast_path_reductions_for_spawn_return(w, ff); 3702 3703 // Pointer will be non-NULL if there are 3704 // still reductions to execute. 3705 if (left_map_ptr) { 3706 // WARNING: This method call may release the lock 3707 // on ff->parent and re-acquire it (possibly on a 3708 // different worker). 3709 // We can't hold locks while actually executing 3710 // reduce functions. 3711 w = slow_path_reductions_for_spawn_return(w, 3712 ff, 3713 left_map_ptr); 3714 verify_current_wkr(w); 3715 } 3716 3717 finish_spawn_return_on_user_stack(w, ff->parent, ff); 3718 // WARNING: the use of this lock macro is deceptive. 3719 // The worker may have changed here. 3720 } END_WITH_FRAME_LOCK(w, ff->parent); 3721 return w; 3722} 3723 3724 3725 3726/** 3727 * Execute fast "reductions" when ff stalls at a sync. 3728 * 3729 * @param w The currently executing worker. 3730 * @param ff The full frame stalling at a sync. 3731 * @return 1 if we are finished with all reductions after calling this method. 3732 * @return 0 if we still need to execute the slow path reductions. 
3733 */ 3734static inline 3735int fast_path_reductions_for_sync(__cilkrts_worker *w, 3736 full_frame *ff) { 3737 // Return 0 if there is some reduction that needs to happen. 3738 return !(w->reducer_map || ff->pending_exception); 3739} 3740 3741/** 3742 * Executes slow reductions when ff stalls at a sync. 3743 * This method should execute only if 3744 * fast_path_reductions_for_sync(w, ff) returned 0. 3745 * 3746 * After this method completes: 3747 * 1. ff's current reducer map has been deposited into 3748 * right_reducer_map of ff's rightmost child, or 3749 * ff->children_reducer_map if ff has no children. 3750 * 2. Similarly for ff's current exception. 3751 * 3. Nothing to calculate for stacks --- if we are stalling 3752 * we will always free a stack. 3753 * 3754 * This method may repeatedly acquire/release the lock on ff. 3755 * 3756 * @param w The currently executing worker. 3757 * @param ff The full frame stalling at a sync. 3758 * @return The worker returning from this method. 3759 */ 3760static __cilkrts_worker* 3761slow_path_reductions_for_sync(__cilkrts_worker *w, 3762 full_frame *ff) 3763{ 3764 struct cilkred_map *left_map; 3765 struct cilkred_map *middle_map; 3766 3767#if (REDPAR_DEBUG > 0) 3768 CILK_ASSERT(ff); 3769 CILK_ASSERT(w->head == w->tail); 3770#endif 3771 3772 middle_map = w->reducer_map; 3773 w->reducer_map = NULL; 3774 3775 // Loop invariant: middle_map should be valid (the current map to reduce). 3776 // left_map is junk. 3777 // w->reducer_map == NULL. 3778 while (1) { 3779 BEGIN_WITH_FRAME_LOCK(w, ff) { 3780 splice_left_ptrs left_ptrs = compute_left_ptrs_for_sync(w, ff); 3781 3782 // Grab the "left" map and store pointers to those locations. 3783 left_map = *(left_ptrs.map_ptr); 3784 *(left_ptrs.map_ptr) = NULL; 3785 3786 // Slide the maps in our struct left as far as possible. 
3787 if (!left_map) { 3788 left_map = middle_map; 3789 middle_map = NULL; 3790 } 3791 3792 *(left_ptrs.exception_ptr) = 3793 __cilkrts_merge_pending_exceptions(w, 3794 *left_ptrs.exception_ptr, 3795 ff->pending_exception); 3796 ff->pending_exception = NULL; 3797 3798 // If there is no middle map, then we are done. 3799 // Deposit left and return. 3800 if (!middle_map) { 3801 *(left_ptrs).map_ptr = left_map; 3802 #if (REDPAR_DEBUG > 0) 3803 CILK_ASSERT(NULL == w->reducer_map); 3804 #endif 3805 // Sanity check upon leaving the loop. 3806 verify_current_wkr(w); 3807 // Make sure to unlock before we return! 3808 __cilkrts_frame_unlock(w, ff); 3809 return w; 3810 } 3811 } END_WITH_FRAME_LOCK(w, ff); 3812 3813 // If we get here, we have a nontrivial reduction to execute. 3814 middle_map = repeated_merge_reducer_maps(&w, 3815 left_map, 3816 middle_map); 3817 verify_current_wkr(w); 3818 3819 // Save any exceptions generated because of the reduction 3820 // process. These get merged the next time around the 3821 // loop. 3822 CILK_ASSERT(NULL == ff->pending_exception); 3823 ff->pending_exception = w->l->pending_exception; 3824 w->l->pending_exception = NULL; 3825 } 3826 3827 // We should never break out of the loop above. 3828 CILK_ASSERT(0); 3829 return NULL; 3830} 3831 3832 3833/** 3834 * Execute reductions when ff stalls at a sync. 3835 * 3836 * Execution starts on w, but may finish on a different worker. 3837 * This method may acquire/release the lock on ff. 3838 * 3839 * @param w The currently executing worker. 3840 * @param ff The full frame of the spawned function at the sync 3841 * @param sf_at_sync The __cilkrts_stack_frame stalling at a sync 3842 * @return The worker returning from this method. 3843 */ 3844static __cilkrts_worker* 3845execute_reductions_for_sync(__cilkrts_worker *w, 3846 full_frame *ff, 3847 __cilkrts_stack_frame *sf_at_sync) 3848{ 3849 int finished_reductions; 3850 // Step B1 from reducer protocol above: 3851 // Restore runtime invariants. 
3852 // 3853 // The following code for this step is almost equivalent to 3854 // the following sequence: 3855 // 1. disown(w, ff, sf_at_sync, "sync") (which itself 3856 // calls make_unrunnable(w, ff, sf_at_sync)) 3857 // 2. make_runnable(w, ff, sf_at_sync). 3858 // 3859 // The "disown" will mark the frame "sf_at_sync" 3860 // as stolen and suspended, and save its place on the stack, 3861 // so it can be resumed after the sync. 3862 // 3863 // The difference is, that we don't want the disown to 3864 // break the following connections yet, since we are 3865 // about to immediately make sf/ff runnable again anyway. 3866 // sf_at_sync->worker == w 3867 // w->l->frame_ff == ff. 3868 // 3869 // These connections are needed for parallel reductions, since 3870 // we will use sf / ff as the stack frame / full frame for 3871 // executing any potential reductions. 3872 // 3873 // TBD: Can we refactor the disown / make_unrunnable code 3874 // to avoid the code duplication here? 3875 3876 ff->call_stack = NULL; 3877 3878 // Normally, "make_unrunnable" would add CILK_FRAME_STOLEN and 3879 // CILK_FRAME_SUSPENDED to sf_at_sync->flags and save the state of 3880 // the stack so that a worker can resume the frame in the correct 3881 // place. 3882 // 3883 // But on this path, CILK_FRAME_STOLEN should already be set. 3884 // Also, we technically don't want to suspend the frame until 3885 // the reduction finishes. 3886 // We do, however, need to save the stack before 3887 // we start any reductions, since the reductions might push more 3888 // data onto the stack. 3889 CILK_ASSERT(sf_at_sync->flags | CILK_FRAME_STOLEN); 3890 3891 __cilkrts_put_stack(ff, sf_at_sync); 3892 __cilkrts_make_unrunnable_sysdep(w, ff, sf_at_sync, 1, 3893 "execute_reductions_for_sync"); 3894 CILK_ASSERT(w->l->frame_ff == ff); 3895 3896 // Step B2: Execute reductions on user stack. 3897 // Check if we have any "real" reductions to do. 
3898 finished_reductions = fast_path_reductions_for_sync(w, ff); 3899 3900 if (!finished_reductions) { 3901 // Still have some real reductions to execute. 3902 // Run them here. 3903 3904 // This method may acquire/release the lock on ff. 3905 w = slow_path_reductions_for_sync(w, ff); 3906 3907 // The previous call may return on a different worker. 3908 // than what we started on. 3909 verify_current_wkr(w); 3910 } 3911 3912#if REDPAR_DEBUG >= 0 3913 CILK_ASSERT(w->l->frame_ff == ff); 3914 CILK_ASSERT(ff->call_stack == NULL); 3915#endif 3916 3917 // Now we suspend the frame ff (since we've 3918 // finished the reductions). Roughly, we've split apart the 3919 // "make_unrunnable" call here --- we've already saved the 3920 // stack info earlier before the reductions execute. 3921 // All that remains is to restore the call stack back into the 3922 // full frame, and mark the frame as suspended. 3923 ff->call_stack = sf_at_sync; 3924 sf_at_sync->flags |= CILK_FRAME_SUSPENDED; 3925 3926 // At a nontrivial sync, we should always free the current fiber, 3927 // because it can not be leftmost. 3928 w->l->fiber_to_free = ff->fiber_self; 3929 ff->fiber_self = NULL; 3930 return w; 3931} 3932 3933 3934/* 3935 Local Variables: ** 3936 c-file-style:"bsd" ** 3937 c-basic-offset:4 ** 3938 indent-tabs-mode:nil ** 3939 End: ** 3940*/ 3941