/*
 * Copyright (c) 2011 Apple Inc. All rights reserved.
 *
 * @APPLE_APACHE_LICENSE_HEADER_START@
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 * @APPLE_APACHE_LICENSE_HEADER_END@
 */
/*
    ThreadLocalCollector.cpp
    Thread Local Collector
    Copyright (c) 2008-2011 Apple Inc. All rights reserved.
 */

#include "ThreadLocalCollector.h"
#include "auto_trace.h"
#include "auto_dtrace.h"
#include "Locks.h"
#include "BlockRef.h"

namespace Auto {

    //
    // is_block_aligned_range()
    //
    // Returns true if the range of addresses is block aligned, and can therefore be
    // scanned with the four-words-at-a-time unrolled loop.
    //
    inline bool is_block_aligned_range(void **start, void **end) {
        return (((uintptr_t)start | (uintptr_t)end) & mask(block_alignment)) == 0;
    }

    //
    // append_block()
    //
    // Adds block to the list in _tlcBuffer, irrespective of how the buffer is being used at the moment.
    //
    inline void ThreadLocalCollector::append_block(void *block) {
        _tlcBuffer[_tlcBufferCount++] = block;
    }

    //
    // mark_push_block()
    //
    // Validates that block is a thread-local block start pointer.
    // If it is, and it is unmarked, marks block and adds it to _tlcBuffer/_tlcBufferCount.
    //
    inline void ThreadLocalCollector::mark_push_block(void *block) {
        if (_zone->in_subzone_memory(block)) {
            Subzone *subzone = Subzone::subzone(block);
            usword_t q = subzone->quantum_index_unchecked(block);
            if (subzone->block_is_start(q) && subzone->is_thread_local(q)) {
                int32_t blockIndex = _localBlocks.slotIndex(block);
                if (blockIndex != -1 && !_localBlocks.testAndSetMarked(blockIndex)) {
                    append_block(block);
                }
            }
        }
    }

    void ThreadLocalCollector::scan_stack_range(const Range &range) {
        // set up the iteration for this range
        void **reference = (void **)range.address();
        void ** const end = (void **)range.end();

        // local copies of valid address info
        const uintptr_t valid_lowest = (uintptr_t)_coverage.address();
        const uintptr_t valid_size = (uintptr_t)_coverage.end() - valid_lowest;

        // iterate through all the potential references

        // TODO: Since stack ranges are large aligned, unroll the loop using that alignment.
        if (is_block_aligned_range(reference, end)) {
            // On both 32- and 64-bit architectures the smallest block size is 4 words,
            // so this loop scans one quantum at a time.
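            // The pointer filters below use the classic single-comparison range check: the
            // subtraction and comparison happen in unsigned arithmetic, so any pointer below
            // valid_lowest wraps around to a huge value and fails the test. An equivalent
            // sketch (illustrative helper only, not part of libauto):
            //
            //     bool in_coverage(uintptr_t p) {
            //         return (p - valid_lowest) < valid_size;  // same as: valid_lowest <= p && p < valid_lowest + valid_size
            //     }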
            while (reference < end) {
                // do four at a time to get a better interleaving of code
                void *referent0 = reference[0];
                void *referent1 = reference[1];
                void *referent2 = reference[2];
                void *referent3 = reference[3];
                reference += 4; // increment here to avoid stall on loop check
                __builtin_prefetch(reference);
                if (((intptr_t)referent0 - valid_lowest) < valid_size) mark_push_block(referent0);
                if (((intptr_t)referent1 - valid_lowest) < valid_size) mark_push_block(referent1);
                if (((intptr_t)referent2 - valid_lowest) < valid_size) mark_push_block(referent2);
                if (((intptr_t)referent3 - valid_lowest) < valid_size) mark_push_block(referent3);
            }
        } else {
            for (void *last_valid_pointer = end - 1; reference <= last_valid_pointer; ++reference) {
                // get referent
                void *referent = *reference;

                // if it is a block then check this block out
                if (((intptr_t)referent - valid_lowest) < valid_size) {
                    mark_push_block(referent);
                }
            }
        }
    }

    void ThreadLocalCollector::scan_range(const Range &range) {
        // set up the iteration for this range
        void **reference = (void **)range.address();
        void ** const end = (void **)range.end();

        // local copies of valid address info
        const uintptr_t valid_lowest = (uintptr_t)_coverage.address();
        const uintptr_t valid_size = (uintptr_t)_coverage.end() - valid_lowest;

        // iterate through all the potential references
        for (void *last_valid_pointer = end - 1; reference <= last_valid_pointer; ++reference) {
            // get referent
            void *referent = *reference;

            // if it is a block then check this block out
            if (((intptr_t)referent - valid_lowest) < valid_size) {
                mark_push_block(referent);
            }
        }
    }

    void ThreadLocalCollector::scan_with_layout(const Range &range, const unsigned char* map) {
        // convert to double indirect
        void **reference = (void **)range.address();
        void ** const end = (void **)range.end();
        Range subrange;
        // while not '\0' terminator
        while (unsigned data = *map++) {
            // extract the skip and run
            unsigned skip = data >> 4;
            unsigned run = data & 0xf;

            // advance the reference by the skip
            reference += skip;

            // scan runs as a range.
            subrange.set_range(reference, reference + run);
            if (subrange.address() < end && subrange.end() <= end) {
                // <rdar://problem/6516045>: make sure we only scan valid ranges.
                scan_range(subrange);
            } else {
                break;
            }
            reference += run;
        }

        if (reference < end) {
            // since objects can be allocated with extra data at the end, scan the remainder conservatively.
            subrange.set_range((void *)reference, end);
            scan_range(subrange);
        }
    }
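
    // For illustration, a hypothetical layout map using the nibble encoding decoded above
    // (high nibble = words to skip, low nibble = words to scan). An object laid out as
    // [scanned, scanned, unscanned, scanned] would be described by:
    //
    //     const unsigned char example_map[] = {
    //         0x02,   // skip 0 words, scan a run of 2 (words 0-1)
    //         0x11,   // skip 1 word, scan a run of 1 (word 3)
    //         0x00,   // '\0' terminator
    //     };
    //
    // Any words past the map's coverage are scanned conservatively by the tail of
    // scan_with_layout().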

    inline void ThreadLocalCollector::scan_local_block(Subzone *subzone, usword_t q, void *block) {
        Range range(block, subzone->size(q));
        const unsigned char *map = (subzone->layout(q) & AUTO_OBJECT) ? _zone->layout_map_for_block(block) : NULL;
        if (map)
            scan_with_layout(range, map);
        else
            scan_range(range);
    }

    //
    // scan_marked_blocks
    //
    // Scans all the blocks in _tlcBuffer. Note that _tlcBufferCount can grow during the
    // loop as newly marked blocks are appended, so this computes a transitive closure.
    //
    void ThreadLocalCollector::scan_marked_blocks() {
        size_t index = 0;
        while (index < _tlcBufferCount) {
            void *block = _tlcBuffer[index++];
            Subzone *subzone = Subzone::subzone(block);
            usword_t q = subzone->quantum_index_unchecked(block);
            if (subzone->should_scan_local_block(q)) {
                scan_local_block(subzone, q, block);
            }
        }
    }

    //
    // scavenge_local
    //
    // We can't return blocks to the general pool because the general collector thread may
    // still be scanning us. Instead we return the data to our thread-local cache.
    //
    void ThreadLocalCollector::scavenge_local(size_t count, void *garbage[]) {
        size_t blocks_freed = 0;
        size_t bytes_freed = 0;
        size_t bytes_dropped = 0;

        // if collection checking is on then clear the check count for all the garbage blocks
        Zone *zone = _thread.zone();
        if (zone->collection_checking_enabled()) {
            zone->clear_garbage_checking_count(garbage, count);
        }

        GARBAGE_COLLECTION_COLLECTION_PHASE_BEGIN((auto_zone_t*)_zone, AUTO_TRACE_SCAVENGING_PHASE);
        for (size_t index = 0; index < count; index++) {
            void *block = garbage[index];
            // Only small quantum blocks are currently allocated locally; take advantage of that.
            Subzone *subzone = Subzone::subzone(block);
            usword_t q = subzone->quantum_index_unchecked(block);
            if (!subzone->has_refcount(q)) {
                blocks_freed++;
                size_t block_size = subzone->size(q);
                if (malloc_logger) malloc_logger(MALLOC_LOG_TYPE_DEALLOCATE | MALLOC_LOG_TYPE_HAS_ZONE, uintptr_t(_zone), uintptr_t(block), 0, 0, 0);
                if (!_thread.thread_cache_add(block, subzone, q)) {
                    // drop the block on the floor and leave it for the heap collector to find
                    subzone->allocate(q, subzone->length(q), AUTO_UNSCANNED, false, false);
                    bytes_dropped += block_size;
                } else {
                    bytes_freed += block_size;
                }
            } else {
                SubzoneBlockRef ref(subzone, q);
                if (!is_zombie(block)) {
                    _zone->handle_overretained_garbage(block, ref.refcount(), ref.layout());
                } else {
                    // transition the block from local garbage to retained global
                    SpinLock lock(subzone->admin()->lock()); // zombify_internal requires we hold the admin lock
                    subzone->allocate(q, subzone->length(q), subzone->layout(q), true, false);
                    _zone->zombify_internal(ref);
                }
            }
        }
        if (bytes_dropped) {
            _zone->adjust_allocation_counter(bytes_dropped);
        }
        GARBAGE_COLLECTION_COLLECTION_PHASE_END((auto_zone_t*)_zone, AUTO_TRACE_SCAVENGING_PHASE, (uint64_t)blocks_freed, (uint64_t)bytes_freed);
    }

    static void finalize_work(Zone *zone, const size_t garbage_count, void *garbage[]) {
        size_t blocks_freed = 0, bytes_freed = 0;
        zone->invalidate_garbage(garbage_count, garbage);
        zone->free_garbage(garbage_count, garbage, 0, NULL, blocks_freed, bytes_freed); // TODO: all blocks are in the small admin, create a batched version.
        zone->clear_zombies();
        // garbage is an aux_malloc'd copy handed off by finalize_local_garbage_later(); free it here.
        aux_free(garbage);
    }

    // Assumes _tlcBuffer/_tlcBufferCount hold the garbage list.
    bool ThreadLocalCollector::block_in_garbage_list(void *block) {
        for (size_t i = 0; i < _tlcBufferCount; i++) {
            if (_tlcBuffer[i] == block)
                return true;
        }
        return false;
    }

    // Assumes _tlcBuffer/_tlcBufferCount hold the garbage list.
    void ThreadLocalCollector::evict_local_garbage() {
        // scan the garbage blocks to evict all blocks reachable from the garbage list
        size_t evict_cursor = _tlcBufferCount;

        size_t scan_cursor = 0;
        while (scan_cursor < _tlcBufferCount) {
            void *block = _tlcBuffer[scan_cursor++];
            Subzone *subzone = Subzone::subzone(block);
            usword_t q = subzone->quantum_index_unchecked(block);
            if (subzone->is_scanned(q)) {
                scan_local_block(subzone, q, block);
            }
        }

        usword_t global_size = 0;
        while (evict_cursor < _tlcBufferCount) {
            void *block = _tlcBuffer[evict_cursor++];
            // evict this block, since it is reachable from garbage, but not itself garbage.
            Subzone *subzone = Subzone::subzone(block);
            usword_t q = subzone->quantum_index_unchecked(block);
            subzone->make_global(q);
            _localBlocks.remove(block);
            global_size += subzone->size(q);
        }
        if (global_size != 0)
            _zone->adjust_allocation_counter(global_size);
    }
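
    // A note on buffer reuse (descriptive only; the layout is inferred from the code above
    // and below): _tlcBuffer serves double duty during a collection. Scanning fills it with
    // marked (reachable) blocks; process_local_garbage() then resets the count and refills
    // it with the garbage list; evict_local_garbage() appends the survivors it discovers
    // while scanning that garbage:
    //
    //     [ garbage[0] ... garbage[count-1] | blocks reachable from garbage ... ]
    //                                         ^ appended past evict_cursor, made global rather than freed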

    //
    // process_local_garbage
    //
    void ThreadLocalCollector::process_local_garbage(void (*garbage_list_handler)(ThreadLocalCollector *)) {
        // Gather the garbage blocks into _tlcBuffer, which currently holds marked blocks.
        usword_t garbage_count = _localBlocks.count() - _tlcBufferCount;
        if (garbage_count == 0) {
            // no garbage
            // TODO: if we keep hitting this condition, we could use feedback to increase the thread local threshold.
            _localBlocks.clearFlags(); // clears flags only.
            GARBAGE_COLLECTION_COLLECTION_END((auto_zone_t*)_zone, 0ull, 0ull, _localBlocks.count(), (uint64_t)(-1));
            return;
        }

        _tlcBufferCount = 0;
        size_t scavenged_size = 0;

        // use the mark bit in _localBlocks to generate a garbage list in _tlcBuffer/_tlcBufferCount
        for (uint32_t i = _localBlocks.firstOccupiedSlot(), last = _localBlocks.lastOccupiedSlot(); (i <= last) && (_tlcBufferCount != garbage_count); i++) {
            void *block = _localBlocks.unmarkedPointerAtIndex(i);
            if (block) {
                Subzone *subzone = Subzone::subzone(block);
                usword_t q = subzone->quantum_index_unchecked(block);
                if (subzone->is_thread_local(q)) {
                    scavenged_size += subzone->size(q);
                    append_block(block);
                    _localBlocks.remove(i);
                } else {
                    auto_error(_zone, "not thread local garbage", (const void *)block);
                }
            }
        }
#ifdef MEASURE_TLC_STATS
        _zone->statistics().add_local_collected(_tlcBufferCount);
#endif

        // Clear the marks & compact. Must be done before evict_local_garbage(), which does more marking.
        // If the thread is not suspended then we can also possibly shrink the locals list size.
        // If the thread IS suspended then we must not allocate.
        if (_thread.suspended())
            _localBlocks.clearFlagsRehash();
        else
            _localBlocks.clearFlagsCompact();

        AUTO_PROBE(auto_probe_end_local_scan(_tlcBufferCount, &_tlcBuffer[0]));

        garbage_list_handler(this);

        // skip computing the locals size if the probe is not enabled
        if (GARBAGE_COLLECTION_COLLECTION_END_ENABLED())
            GARBAGE_COLLECTION_COLLECTION_END((auto_zone_t*)_zone, garbage_count, (uint64_t)scavenged_size, _localBlocks.count(), (uint64_t)_localBlocks.localsSize());
    }

    // Assumes _tlcBuffer/_tlcBufferCount hold the garbage list.
    void ThreadLocalCollector::finalize_local_garbage_now(ThreadLocalCollector *tlc) {
        size_t garbage_count = tlc->_tlcBufferCount;
        mark_local_garbage(tlc->_tlcBuffer, garbage_count);
        tlc->_zone->invalidate_garbage(garbage_count, &tlc->_tlcBuffer[0]);
        tlc->scavenge_local(garbage_count, &tlc->_tlcBuffer[0]);
#ifdef MEASURE_TLC_STATS
        tlc->_zone->statistics().add_recycled(garbage_count);
#endif
    }

    inline void ThreadLocalCollector::mark_local_garbage(void **garbage_list, size_t garbage_count) {
        for (size_t i = 0; i < garbage_count; i++) {
            void *block = garbage_list[i];
            Subzone *subzone = Subzone::subzone(block);
            usword_t q = subzone->quantum_index_unchecked(block);
            subzone->mark_local_garbage(q);
        }
    }

    // Assumes _tlcBuffer/_tlcBufferCount hold the garbage list.
    void ThreadLocalCollector::finalize_local_garbage_later(ThreadLocalCollector *tlc) {
        size_t garbage_count = tlc->_tlcBufferCount;
        tlc->evict_local_garbage(); // note this modifies _tlcBuffer/_tlcBufferCount
        mark_local_garbage(tlc->_tlcBuffer, garbage_count);
        Zone *z = tlc->_zone;
        // Snapshot the garbage list; the Block below captures z, garbage_count and garbage_copy
        // by value, and finalize_work() frees the copy on the collection queue.
        void **garbage_copy = (void **)aux_malloc(garbage_count * sizeof(void *));
        memcpy(garbage_copy, tlc->_tlcBuffer, garbage_count * sizeof(void *));
        dispatch_async(tlc->_zone->_collection_queue, ^{ finalize_work(z, garbage_count, garbage_copy); });
#ifdef MEASURE_TLC_STATS
        tlc->_zone->statistics().add_global_freed(garbage_count);
#endif
    }

    // Assumes _tlcBuffer/_tlcBufferCount hold the garbage list.
    void ThreadLocalCollector::unmark_local_garbage(ThreadLocalCollector *tlc) {
        size_t garbage_count = tlc->_tlcBufferCount;
        tlc->evict_local_garbage(); // note this modifies _tlcBuffer/_tlcBufferCount
        mark_local_garbage(tlc->_tlcBuffer, garbage_count);
        for (uint32_t i = 0; i < garbage_count; i++) {
            void *block = tlc->_tlcBuffer[i];
            Subzone *sz = Subzone::subzone(block);
            usword_t q = sz->quantum_index_unchecked(block);
            sz->test_and_clear_mark(q);
            sz->mark_global_garbage(q);
        }
#ifdef MEASURE_TLC_STATS
        tlc->_zone->statistics().add_global_freed(garbage_count);
#endif
    }
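
    // Summary of the three garbage_list_handler strategies passed to process_local_garbage():
    //
    //     finalize_local_garbage_now   - finalize and recycle the garbage immediately, on this thread.
    //     finalize_local_garbage_later - evict survivors, then hand an aux_malloc'd copy of the
    //                                    garbage list to the collection queue (see finalize_work()).
    //     unmark_local_garbage         - evict survivors and mark blocks as global garbage for the
    //                                    heap collector; used by collect_suspended().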

    //
    // should_collect
    //
    bool ThreadLocalCollector::should_collect(Zone *zone, Thread &thread, bool canFinalizeNow) {
        if (thread.thread_local_collector() == NULL) {
            if (canFinalizeNow) {
                // Since we have permission to finalize now, our criterion for collecting is simply that
                // some bare minimum number of thread-local objects exists. I strongly suggest that we
                // also consider allocation thresholds for this trigger.
                return (thread.locals().count() >= (local_allocations_size_limit/10));
            } else {
                // If the count has reached the set size limit then try to collect to make space even though we can't finalize.
                if (zone->_collection_queue) {
                    return (thread.locals().count() >= local_allocations_size_limit);
                }
            }
        }
        return false;
    }

    bool ThreadLocalCollector::should_collect_suspended(Thread &thread)
    {
        assert(thread.suspended());
        // Don't do a suspended scan if malloc stack logging is turned on. If the thread happens to be
        // in the middle of an allocation, TLC's own use of the aux_zone() will deadlock.
        bool collect = (malloc_logger == NULL) && thread.tlc_watchdog_should_trigger() && !Sentinel::is_guarded(thread.localsGuard()) && thread.locals().count() > 0;
        if (collect)
            thread.tlc_watchdog_disable();
        else
            thread.tlc_watchdog_tickle();
        return collect;
    }
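
    // Worked example with hypothetical numbers (local_allocations_size_limit is defined
    // elsewhere): if the limit were 1000, a thread that may finalize collects once it holds
    // 100 or more local blocks, while one that may not finalize waits for 1000 and only
    // collects when a collection queue exists to perform the finalization later.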

#ifndef __BLOCKS__
    class thread_local_scanner_helper : public Thread::thread_scanner {
        ThreadLocalCollector &_collector;
    public:
        thread_local_scanner_helper(ThreadLocalCollector &collector) : _collector(collector) {}
        virtual void operator() (Thread *thread, Range &range) { _collector.scan_stack_range(range); }
    };
#endif

    void ThreadLocalCollector::trace_scanning_phase_end() {
        size_t scanned_size = 0;
        for (usword_t i = 0; i < _tlcBufferCount; i++) {
            void *block = _tlcBuffer[i];
            Subzone *subzone = Subzone::subzone(block);
            usword_t q = subzone->quantum_index_unchecked(block);
            if (subzone->should_scan_local_block(q)) {
                scanned_size += subzone->size(q);
            }
        }
        GARBAGE_COLLECTION_COLLECTION_PHASE_END((auto_zone_t*)_zone, AUTO_TRACE_SCANNING_PHASE, (uint64_t)_tlcBufferCount, (uint64_t)scanned_size);
    }

    //
    // collect
    //
    void ThreadLocalCollector::collect(bool finalizeNow) {
        AUTO_PROBE(auto_probe_begin_local_scan());
        assert(_thread.thread_local_collector() == NULL);
        _thread.set_thread_local_collector(this);

        _thread.tlc_watchdog_reset();

        GARBAGE_COLLECTION_COLLECTION_BEGIN((auto_zone_t*)_zone, AUTO_TRACE_LOCAL);

        GARBAGE_COLLECTION_COLLECTION_PHASE_BEGIN((auto_zone_t*)_zone, AUTO_TRACE_SCANNING_PHASE);

#ifdef __BLOCKS__
        // scan the stack for the first set of hits
        _thread.scan_current_thread(^(Thread *thread, const Range &range) {
            this->scan_stack_range(range);
        }, _stack_bottom);
#else
        thread_local_scanner_helper helper(*this);
        _thread.scan_current_thread(helper, _stack_bottom);
#endif

        // recurse on what are now the roots
        scan_marked_blocks();

        if (GARBAGE_COLLECTION_COLLECTION_PHASE_END_ENABLED()) {
            trace_scanning_phase_end();
        }

        process_local_garbage(finalizeNow ? finalize_local_garbage_now : finalize_local_garbage_later);

        _thread.set_thread_local_collector(NULL);

        if (_localBlocks.count() > local_allocations_size_limit/2)
            _thread.flush_local_blocks();

        AUTO_PROBE(auto_probe_local_collection_complete());
    }

    //
    // collect_suspended
    //
    void ThreadLocalCollector::collect_suspended(Range &registers, Range &stack) {
        AUTO_PROBE(auto_probe_begin_local_scan());
        assert(_thread.thread_local_collector() == NULL);
        assert(_thread.suspended());
        _thread.set_thread_local_collector(this);

        GARBAGE_COLLECTION_COLLECTION_BEGIN((auto_zone_t*)_zone, AUTO_TRACE_LOCAL);

        GARBAGE_COLLECTION_COLLECTION_PHASE_BEGIN((auto_zone_t*)_zone, AUTO_TRACE_SCANNING_PHASE);

        scan_range(stack);
        scan_range(registers);

        // recurse on what are now the roots
        scan_marked_blocks();

        if (GARBAGE_COLLECTION_COLLECTION_PHASE_END_ENABLED()) {
            trace_scanning_phase_end();
        }

        process_local_garbage(unmark_local_garbage);

        _thread.set_thread_local_collector(NULL);

        AUTO_PROBE(auto_probe_local_collection_complete());
    }

    void ThreadLocalCollector::reap_all() {
        GARBAGE_COLLECTION_COLLECTION_BEGIN((auto_zone_t*)_zone, AUTO_TRACE_LOCAL);
        _thread.set_thread_local_collector(this);
        process_local_garbage(finalize_local_garbage_now);
        _thread.set_thread_local_collector(NULL);
    }

    //
    // eject_local_block
    //
    // Removes block and all referenced stack-local blocks.
    //
    void ThreadLocalCollector::eject_local_block(void *startingBlock) {
        if (_thread.thread_local_collector() != NULL) {
            // if a thread-local collection is in progress then we can't use the TLC buffer in the thread object
            _tlcBuffer = (void **)malloc(local_allocations_size_limit * sizeof(void *));
        }
        Subzone *subzone = Subzone::subzone(startingBlock);
#ifndef NDEBUG
        {
            usword_t q;
            assert(subzone->block_is_start(startingBlock, &q) && subzone->is_thread_local(q));
            assert(_localBlocks.slotIndex(startingBlock) != -1);
        }
#endif

        mark_push_block(startingBlock);

        // mark all local blocks reachable from this block.
        scan_marked_blocks();

        // loop over all marked blocks, and make them global.
        size_t evicted_size = 0;
        for (size_t i = 0; i < _tlcBufferCount; i++) {
            void *block = _tlcBuffer[i];
            subzone = Subzone::subzone(block);
            usword_t q = subzone->quantum_index_unchecked(block);
            assert(subzone->is_thread_local(q));
            subzone->make_global(q);
            _localBlocks.remove(block);
            evicted_size += subzone->size(q);
        }
        // Assertion: no need to clear flags because all marked objects were removed.
        _zone->adjust_allocation_counter(evicted_size);

        if (_thread.thread_local_collector() != NULL) {
            free(_tlcBuffer);
        }
    }

    void ThreadLocalCollector::add_zombie(void *block) {
        if (!_zombies)
            _zombies = new PtrHashSet();

        if (_zombies->find(block) == _zombies->end()) {
            _zombies->insert(block);
        }
    }

    inline bool ThreadLocalCollector::is_zombie(void *block) {
        if (_zombies) {
            PtrHashSet::iterator iter = _zombies->find(block);
            return (iter != _zombies->end());
        } else {
            return false;
        }
    }
}