/*
 * Copyright (c) 2011 Apple Inc. All rights reserved.
 *
 * @APPLE_APACHE_LICENSE_HEADER_START@
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 * @APPLE_APACHE_LICENSE_HEADER_END@
 */
/*
    ThreadLocalCollector.cpp
    Thread Local Collector
    Copyright (c) 2008-2011 Apple Inc. All rights reserved.
 */

#include "ThreadLocalCollector.h"
#include "auto_trace.h"
#include "auto_dtrace.h"
#include "Locks.h"
#include "BlockRef.h"

namespace Auto {
    //
    // is_block_aligned_range()
    //
    // Returns true if the range of addresses is block aligned, and therefore can be
    // scanned in the 4-words-at-a-time unrolled loop.
    //
    inline bool is_block_aligned_range(void **start, void **end) {
        return (((uintptr_t)start | (uintptr_t)end) & mask(block_alignment)) == 0;
    }

    //
    // append_block()
    //
    // Add block to the list in _tlcBuffer, irrespective of how the buffer is being used at the moment.
    //
    inline void ThreadLocalCollector::append_block(void *block) {
        _tlcBuffer[_tlcBufferCount++] = block;
    }

    //
    // mark_push_block()
    //
    // Validates that block is a thread local block start pointer.
    // If it is, and it is unmarked, marks block and adds block to _tlcBuffer/_tlcBufferCount.
    //
    inline void ThreadLocalCollector::mark_push_block(void *block) {
        if (_zone->in_subzone_memory(block)) {
            Subzone *subzone = Subzone::subzone(block);
            usword_t q = subzone->quantum_index_unchecked(block);
            if (subzone->block_is_start(q) && subzone->is_thread_local(q)) {
                int32_t blockIndex = _localBlocks.slotIndex(block);
                if (blockIndex != -1 && !_localBlocks.testAndSetMarked(blockIndex)) {
                    append_block(block);
                }
            }
        }
    }

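    //
    // scan_stack_range()
    //
    // Conservatively scans a range of the thread's stack for pointers into the
    // thread-local block set, marking and pushing hits via mark_push_block().
    // Block-aligned ranges are scanned four words at a time with prefetching.
    //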
    void ThreadLocalCollector::scan_stack_range(const Range &range) {
        // set up the iteration for this range
        void ** reference = (void **)range.address();
        void ** const end = (void **)range.end();

        // local copies of valid address info
        const uintptr_t valid_lowest = (uintptr_t)_coverage.address();
        const uintptr_t valid_size = (uintptr_t)_coverage.end() - valid_lowest;

        // iterate through all the potential references

        // TODO:  Since stack ranges are large aligned, unroll the loop using that alignment.
        if (is_block_aligned_range(reference, end)) {
            // On both 32 and 64 bit architectures, the smallest block size is 4 words. This loop
            // is therefore scanning 1 quantum at a time.
            while (reference < end) {
                // do four at a time to get a better interleaving of code
                void *referent0 = reference[0];
                void *referent1 = reference[1];
                void *referent2 = reference[2];
                void *referent3 = reference[3];
                reference += 4; // increment here to avoid stall on loop check
                __builtin_prefetch(reference);
                if (((intptr_t)referent0 - valid_lowest) < valid_size) mark_push_block(referent0);
                if (((intptr_t)referent1 - valid_lowest) < valid_size) mark_push_block(referent1);
                if (((intptr_t)referent2 - valid_lowest) < valid_size) mark_push_block(referent2);
                if (((intptr_t)referent3 - valid_lowest) < valid_size) mark_push_block(referent3);
            }
        } else {
            for (void *last_valid_pointer = end - 1; reference <= last_valid_pointer; ++reference) {
                // get referent
                void *referent = *reference;

                // if is a block then check this block out
                if (((intptr_t)referent - valid_lowest) < valid_size) {
                    mark_push_block(referent);
                }
            }
        }
    }

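    //
    // scan_range()
    //
    // Conservatively scans an arbitrary word-aligned range of memory for pointers
    // into the thread-local block set; used for block contents and, during suspended
    // collections, for the thread's registers and stack.
    //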
    void ThreadLocalCollector::scan_range(const Range &range) {
        // set up the iteration for this range
        void ** reference = (void **)range.address();
        void ** const end = (void **)range.end();

        // local copies of valid address info
        const uintptr_t valid_lowest = (uintptr_t)_coverage.address();
        const uintptr_t valid_size = (uintptr_t)_coverage.end() - valid_lowest;

        // iterate through all the potential references
        for (void *last_valid_pointer = end - 1; reference <= last_valid_pointer; ++reference) {
            // get referent
            void *referent = *reference;

            // if is a block then check this block out
            if (((intptr_t)referent - valid_lowest) < valid_size) {
                mark_push_block(referent);
            }
        }
    }

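    //
    // scan_with_layout()
    //
    // Scans an object using its layout map: each map byte encodes a skip count of
    // unscanned words (high nibble) followed by a run of pointer words to scan
    // (low nibble). Any words remaining beyond the map are scanned conservatively.
    //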
    void ThreadLocalCollector::scan_with_layout(const Range &range, const unsigned char* map) {
        // convert to double indirect
        void **reference = (void **)range.address();
        void ** const end = (void **)range.end();
        Range subrange;
        // while not '\0' terminator
        while (unsigned data = *map++) {
            // extract the skip and run
            unsigned skip = data >> 4;
            unsigned run = data & 0xf;

            // advance the reference by the skip
            reference += skip;

            // scan runs as a range.
            subrange.set_range(reference, reference + run);
            if (subrange.address() < end && subrange.end() <= end) {
                // <rdar://problem/6516045>:  make sure we only scan valid ranges.
                scan_range(subrange);
            } else {
                break;
            }
            reference += run;
        }

        if (reference < end) {
            // since objects can be allocated with extra data at end, scan the remainder conservatively.
            subrange.set_range((void *)reference, end);
            scan_range(subrange);
        }
    }

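    //
    // scan_local_block()
    //
    // Scans a single thread-local block, using its layout map when it is an
    // AUTO_OBJECT that has one, and conservatively otherwise.
    //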
    inline void ThreadLocalCollector::scan_local_block(Subzone *subzone, usword_t q, void *block) {
        Range range(block, subzone->size(q));
        const unsigned char *map = (subzone->layout(q) & AUTO_OBJECT) ? _zone->layout_map_for_block(block) : NULL;
        if (map)
            scan_with_layout(range, map);
        else
            scan_range(range);
    }

    //
    // scan_marked_blocks
    //
    // scans all the blocks in _tlcBuffer
    //
    void ThreadLocalCollector::scan_marked_blocks() {
        size_t index = 0;
        while (index < _tlcBufferCount) {
            void *block = _tlcBuffer[index++];
            Subzone *subzone = Subzone::subzone(block);
            usword_t q = subzone->quantum_index_unchecked(block);
            if (subzone->should_scan_local_block(q)) {
                scan_local_block(subzone, q, block);
            }
        }
    }

    //
    // scavenge_local
    //
    // we can't return to the general pool because the general collector thread may
    // still be scanning us.  Instead we return data to our cache.
    //
    void ThreadLocalCollector::scavenge_local(size_t count, void *garbage[]) {
        size_t blocks_freed = 0;
        size_t bytes_freed = 0;
        size_t bytes_dropped = 0;

        // if collection checking is on then clear the check count for all the garbage blocks
        Zone *zone = _thread.zone();
        if (zone->collection_checking_enabled()) {
            zone->clear_garbage_checking_count(garbage, count);
        }

        GARBAGE_COLLECTION_COLLECTION_PHASE_BEGIN((auto_zone_t*)_zone, AUTO_TRACE_SCAVENGING_PHASE);
        for (size_t index = 0; index < count; index++) {
            void *block = garbage[index];
            // Only small quantum blocks are currently allocated locally, take advantage of that.
            Subzone *subzone = Subzone::subzone(block);
            usword_t q = subzone->quantum_index_unchecked(block);
            if (!subzone->has_refcount(q)) {
                blocks_freed++;
                size_t block_size = subzone->size(q);
                if (malloc_logger) malloc_logger(MALLOC_LOG_TYPE_DEALLOCATE | MALLOC_LOG_TYPE_HAS_ZONE, uintptr_t(_zone), uintptr_t(block), 0, 0, 0);
                if (!_thread.thread_cache_add(block, subzone, q)) {
                    // drop the block on the floor and leave it for the heap collector to find
                    subzone->allocate(q, subzone->length(q), AUTO_UNSCANNED, false, false);
                    bytes_dropped += block_size;
                } else {
                    bytes_freed += block_size;
                }
            } else {
                SubzoneBlockRef ref(subzone, q);
                if (!is_zombie(block)) {
                    _zone->handle_overretained_garbage(block, ref.refcount(), ref.layout());
                } else {
                    // transition the block from local garbage to retained global
                    SpinLock lock(subzone->admin()->lock()); // zombify_internal requires we hold the admin lock
                    subzone->allocate(q, subzone->length(q), subzone->layout(q), true, false);
                    _zone->zombify_internal(ref);
                }
            }
        }
        if (bytes_dropped) {
            _zone->adjust_allocation_counter(bytes_dropped);
        }
        GARBAGE_COLLECTION_COLLECTION_PHASE_END((auto_zone_t*)_zone, AUTO_TRACE_SCAVENGING_PHASE, (uint64_t)blocks_freed, (uint64_t)bytes_freed);
    }

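    //
    // finalize_work()
    //
    // Deferred finalization of evicted local garbage, run on the zone's collection
    // queue: invalidates and frees the copied garbage list, clears any zombies, and
    // releases the copy.
    //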
    static void finalize_work(Zone *zone, const size_t garbage_count, void *garbage[]) {
        size_t blocks_freed = 0, bytes_freed = 0;
        zone->invalidate_garbage(garbage_count, garbage);
        zone->free_garbage(garbage_count, garbage, 0, NULL, blocks_freed, bytes_freed);  // TODO:  all blocks are in the small admin, create a batched version.
        zone->clear_zombies();
        aux_free(garbage);
    }

    // Assumes _tlcBuffer/_tlcBufferCount hold the garbage list
    bool ThreadLocalCollector::block_in_garbage_list(void *block) {
        for (size_t i=0; i<_tlcBufferCount; i++) {
            if (_tlcBuffer[i] == block)
                return true;
        }
        return false;
    }

    // Assumes _tlcBuffer/_tlcBufferCount hold the garbage list
    void ThreadLocalCollector::evict_local_garbage() {
        // scan the garbage blocks to evict all blocks reachable from the garbage list
        size_t evict_cursor = _tlcBufferCount;

        size_t scan_cursor = 0;
        while (scan_cursor < _tlcBufferCount) {
            void *block = _tlcBuffer[scan_cursor++];
            Subzone *subzone = Subzone::subzone(block);
            usword_t q = subzone->quantum_index_unchecked(block);
            if (subzone->is_scanned(q)) {
                scan_local_block(subzone, q, block);
            }
        }

        usword_t global_size = 0;
        while (evict_cursor < _tlcBufferCount) {
            void *block = _tlcBuffer[evict_cursor++];
            // evict this block, since it is reachable from garbage, but not itself garbage.
            Subzone *subzone = Subzone::subzone(block);
            usword_t q = subzone->quantum_index_unchecked(block);
            subzone->make_global(q);
            _localBlocks.remove(block);
            global_size += subzone->size(q);
        }
        if (global_size != 0)
            _zone->adjust_allocation_counter(global_size);
    }

    //
    // process_local_garbage
    //
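    // Converts the unmarked blocks in _localBlocks into a garbage list in
    // _tlcBuffer/_tlcBufferCount, then hands that list to garbage_list_handler.
    //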
    void ThreadLocalCollector::process_local_garbage(void (*garbage_list_handler)(ThreadLocalCollector *)) {
        // Gather the garbage blocks into _tlcBuffer, which currently holds marked blocks.
        usword_t garbage_count = _localBlocks.count() - _tlcBufferCount;
        if (garbage_count == 0) {
            // no garbage
            // TODO:  if we keep hitting this condition, we could use feedback to increase the thread local threshold.
            _localBlocks.clearFlags();    // clears flags only.
            GARBAGE_COLLECTION_COLLECTION_END((auto_zone_t*)_zone, 0ull, 0ull, _localBlocks.count(), (uint64_t)(-1));
            return;
        }

        _tlcBufferCount = 0;
        size_t scavenged_size = 0;

        // use the mark bit in _localBlocks to generate a garbage list in _tlcBuffer/_tlcBufferCount
        for (uint32_t i = _localBlocks.firstOccupiedSlot(), last = _localBlocks.lastOccupiedSlot(); (i <= last) && (_tlcBufferCount != garbage_count); i++) {
            void *block = _localBlocks.unmarkedPointerAtIndex(i);
            if (block) {
                Subzone *subzone = Subzone::subzone(block);
                usword_t q = subzone->quantum_index_unchecked(block);
                if (subzone->is_thread_local(q)) {
                    scavenged_size += subzone->size(q);
                    append_block(block);
                    _localBlocks.remove(i);
                } else {
                    auto_error(_zone, "not thread local garbage", (const void *)block);
                }
            }
        }
#ifdef MEASURE_TLC_STATS
        _zone->statistics().add_local_collected(_tlcBufferCount);
#endif

        // clear the marks & compact. must be done before evict_local_garbage(), which does more marking.
        // if the thread is not suspended then we can also possibly shrink the locals list size
        // if the thread IS suspended then we must not allocate
        if (_thread.suspended())
            _localBlocks.clearFlagsRehash();
        else
            _localBlocks.clearFlagsCompact();

        AUTO_PROBE(auto_probe_end_local_scan(_tlcBufferCount, &_tlcBuffer[0]));

        garbage_list_handler(this);

        // skip computing the locals size if the probe is not enabled
        if (GARBAGE_COLLECTION_COLLECTION_END_ENABLED())
            GARBAGE_COLLECTION_COLLECTION_END((auto_zone_t*)_zone, garbage_count, (uint64_t)scavenged_size, _localBlocks.count(), (uint64_t)_localBlocks.localsSize());
    }

    // Assumes _tlcBuffer/_tlcBufferCount hold the garbage list
    void ThreadLocalCollector::finalize_local_garbage_now(ThreadLocalCollector *tlc) {
        size_t garbage_count = tlc->_tlcBufferCount;
        mark_local_garbage(tlc->_tlcBuffer, garbage_count);
        tlc->_zone->invalidate_garbage(garbage_count, &tlc->_tlcBuffer[0]);
        tlc->scavenge_local(garbage_count, &tlc->_tlcBuffer[0]);
#ifdef MEASURE_TLC_STATS
        tlc->_zone->statistics().add_recycled(garbage_count);
#endif
    }

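    //
    // mark_local_garbage()
    //
    // Marks each block in the garbage list as local garbage in its subzone.
    //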
    inline void ThreadLocalCollector::mark_local_garbage(void **garbage_list, size_t garbage_count) {
        for (size_t i = 0; i < garbage_count; i++) {
            void *block = garbage_list[i];
            Subzone *subzone = Subzone::subzone(block);
            usword_t q = subzone->quantum_index_unchecked(block);
            subzone->mark_local_garbage(q);
        }
    }

    // Assumes _tlcBuffer/_tlcBufferCount hold the garbage list
    void ThreadLocalCollector::finalize_local_garbage_later(ThreadLocalCollector *tlc) {
        size_t garbage_count = tlc->_tlcBufferCount;
        tlc->evict_local_garbage(); // note this modifies _tlcBuffer/_tlcBufferCount
        mark_local_garbage(tlc->_tlcBuffer, garbage_count);
        Zone *z = tlc->_zone;
        void **garbage_copy = (void **)aux_malloc(garbage_count * sizeof(void *));
        memcpy(garbage_copy, tlc->_tlcBuffer, garbage_count * sizeof(void *));
        dispatch_async(tlc->_zone->_collection_queue, ^{ finalize_work(z, garbage_count, garbage_copy); });
#ifdef MEASURE_TLC_STATS
        tlc->_zone->statistics().add_global_freed(garbage_count);
#endif
    }

    // Assumes _tlcBuffer/_tlcBufferCount hold the garbage list
    void ThreadLocalCollector::unmark_local_garbage(ThreadLocalCollector *tlc) {
        size_t garbage_count = tlc->_tlcBufferCount;
        tlc->evict_local_garbage(); // note this modifies _tlcBuffer/_tlcBufferCount
        mark_local_garbage(tlc->_tlcBuffer, garbage_count);
        for (uint32_t i=0; i<garbage_count; i++) {
            void *block = tlc->_tlcBuffer[i];
            Subzone *sz = Subzone::subzone(block);
            usword_t q = sz->quantum_index_unchecked(block);
            sz->test_and_clear_mark(q);
            sz->mark_global_garbage(q);
        }
#ifdef MEASURE_TLC_STATS
        tlc->_zone->statistics().add_global_freed(garbage_count);
#endif
    }

    //
    // should_collect
    //
    bool ThreadLocalCollector::should_collect(Zone *zone, Thread &thread, bool canFinalizeNow) {
        if (thread.thread_local_collector() == NULL) {
            if (canFinalizeNow) {
                // Since we have permission to finalize now, our criterion for collecting is simply that there is some
                // bare minimum number of thread local objects. I strongly suggest that we also consider allocation thresholds
                // for this trigger.
                return (thread.locals().count() >= (local_allocations_size_limit/10));
            } else {
                // If the count has reached the set size limit then try to collect to make space even though we can't finalize.
                if (zone->_collection_queue) {
                    return (thread.locals().count() >= local_allocations_size_limit);
                }
            }
        }
        return false;
    }

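    //
    // should_collect_suspended
    //
    // Decides whether to collect a suspended thread. Suppressed when malloc stack
    // logging is active, when the watchdog has not triggered, when the locals set
    // is guarded, or when there are no local blocks.
    //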
    bool ThreadLocalCollector::should_collect_suspended(Thread &thread)
    {
        assert(thread.suspended());
        // Don't do a suspended scan if malloc stack logging is turned on. If the thread happens to be in the middle of an allocation,
        // TLC's own use of the aux_zone() will deadlock.
        bool collect = (malloc_logger == NULL) && thread.tlc_watchdog_should_trigger() && !Sentinel::is_guarded(thread.localsGuard()) && thread.locals().count() > 0;
        if (collect)
            thread.tlc_watchdog_disable();
        else
            thread.tlc_watchdog_tickle();
        return collect;
    }


#ifndef __BLOCKS__
    class thread_local_scanner_helper : public Thread::thread_scanner {
        ThreadLocalCollector &_collector;
    public:
        thread_local_scanner_helper(ThreadLocalCollector &collector) : _collector(collector) {}
        virtual void operator() (Thread *thread, Range &range) { _collector.scan_stack_range(range); }
    };
#endif


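    //
    // trace_scanning_phase_end
    //
    // Computes the total size of the marked blocks that were scanned and fires the
    // dtrace scanning-phase-end probe.
    //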
    void ThreadLocalCollector::trace_scanning_phase_end() {
        size_t scanned_size = 0;
        for (usword_t i = 0; i < _tlcBufferCount; i++) {
            void *block = _tlcBuffer[i];
            Subzone *subzone = Subzone::subzone(block);
            usword_t q = subzone->quantum_index_unchecked(block);
            if (subzone->should_scan_local_block(q)) {
                scanned_size += subzone->size(q);
            }
        }
        GARBAGE_COLLECTION_COLLECTION_PHASE_END((auto_zone_t*)_zone, AUTO_TRACE_SCANNING_PHASE, (uint64_t)_tlcBufferCount, (uint64_t)scanned_size);
    }

    //
    // collect
    //
    void ThreadLocalCollector::collect(bool finalizeNow) {
        AUTO_PROBE(auto_probe_begin_local_scan());
        assert(_thread.thread_local_collector() == NULL);
        _thread.set_thread_local_collector(this);

        _thread.tlc_watchdog_reset();

        GARBAGE_COLLECTION_COLLECTION_BEGIN((auto_zone_t*)_zone, AUTO_TRACE_LOCAL);

        GARBAGE_COLLECTION_COLLECTION_PHASE_BEGIN((auto_zone_t*)_zone, AUTO_TRACE_SCANNING_PHASE);

#ifdef __BLOCKS__
        // scan the stack for the first set of hits
        _thread.scan_current_thread(^(Thread *thread, const Range &range) {
            this->scan_stack_range(range);
        }, _stack_bottom);
#else
        thread_local_scanner_helper helper(*this);
        _thread.scan_current_thread(helper, _stack_bottom);
#endif

        // recurse on what are now the roots
        scan_marked_blocks();

        if (GARBAGE_COLLECTION_COLLECTION_PHASE_END_ENABLED()) {
            trace_scanning_phase_end();
        }
        process_local_garbage(finalizeNow ? finalize_local_garbage_now : finalize_local_garbage_later);

        _thread.set_thread_local_collector(NULL);

        if (_localBlocks.count() > local_allocations_size_limit/2)
            _thread.flush_local_blocks();

        AUTO_PROBE(auto_probe_local_collection_complete());
    }

    //
    // collect_suspended
    //
    void ThreadLocalCollector::collect_suspended(Range &registers, Range &stack) {
        AUTO_PROBE(auto_probe_begin_local_scan());
        assert(_thread.thread_local_collector() == NULL);
        assert(_thread.suspended());
        _thread.set_thread_local_collector(this);

        GARBAGE_COLLECTION_COLLECTION_BEGIN((auto_zone_t*)_zone, AUTO_TRACE_LOCAL);

        GARBAGE_COLLECTION_COLLECTION_PHASE_BEGIN((auto_zone_t*)_zone, AUTO_TRACE_SCANNING_PHASE);

        scan_range(stack);
        scan_range(registers);

        // recurse on what are now the roots
        scan_marked_blocks();

        if (GARBAGE_COLLECTION_COLLECTION_PHASE_END_ENABLED()) {
            trace_scanning_phase_end();
        }

        process_local_garbage(unmark_local_garbage);

        _thread.set_thread_local_collector(NULL);

        AUTO_PROBE(auto_probe_local_collection_complete());
    }

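    //
    // reap_all
    //
    // Finalizes the thread's local garbage immediately without first scanning the
    // stack, so no local blocks are marked live.
    //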
    void ThreadLocalCollector::reap_all() {
        GARBAGE_COLLECTION_COLLECTION_BEGIN((auto_zone_t*)_zone, AUTO_TRACE_LOCAL);
        _thread.set_thread_local_collector(this);
        process_local_garbage(finalize_local_garbage_now);
        _thread.set_thread_local_collector(NULL);
    }

    //
    // eject_local_block
    //
    // removes block and all referenced stack local blocks
    //
    void ThreadLocalCollector::eject_local_block(void *startingBlock) {
        if (_thread.thread_local_collector() != NULL) {
            // if a thread local collection is in progress then we can't use the tlc buffer in the thread object
            _tlcBuffer = (void **)malloc(local_allocations_size_limit * sizeof(void *));
        }
        Subzone *subzone = Subzone::subzone(startingBlock);
#ifndef NDEBUG
        {
            usword_t q;
            assert(subzone->block_is_start(startingBlock, &q) && subzone->is_thread_local(q));
            assert(_localBlocks.slotIndex(startingBlock) != -1);
        }
#endif

        mark_push_block(startingBlock);

        // mark all local blocks reachable from this block.
        scan_marked_blocks();

        // loop over all marked blocks, and mark them as global.
        size_t evicted_size = 0;
        for (size_t i = 0; i < _tlcBufferCount; i++) {
            void *block = _tlcBuffer[i];
            subzone = Subzone::subzone(block);
            usword_t q = subzone->quantum_index_unchecked(block);
            assert(subzone->is_thread_local(q));
            subzone->make_global(q);
            _localBlocks.remove(block);
            evicted_size += subzone->size(q);
        }
        // Assertion:  No need to clear flags because all objects marked were removed.
        _zone->adjust_allocation_counter(evicted_size);

        if (_thread.thread_local_collector() != NULL) {
            free(_tlcBuffer);
        }
    }

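    //
    // add_zombie
    //
    // Records a block in the per-collector zombie set, lazily allocating the set.
    //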
    void ThreadLocalCollector::add_zombie(void *block) {
        if (!_zombies)
            _zombies = new PtrHashSet();

        if (_zombies->find(block) == _zombies->end()) {
            _zombies->insert(block);
        }
    }

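    //
    // is_zombie
    //
    // Returns true if block has been recorded in the zombie set.
    //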
    inline bool ThreadLocalCollector::is_zombie(void *block) {
        if (_zombies) {
            PtrHashSet::iterator iter = _zombies->find(block);
            return (iter != _zombies->end());
        } else {
            return false;
        }
    }
}