tsan_clock.cpp revision 360784
//===-- tsan_clock.cpp ----------------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file is a part of ThreadSanitizer (TSan), a race detector.
//
//===----------------------------------------------------------------------===//
#include "tsan_clock.h"
#include "tsan_rtl.h"
#include "sanitizer_common/sanitizer_placement_new.h"

// SyncClock and ThreadClock implement vector clocks for sync variables
// (mutexes, atomic variables, file descriptors, etc) and threads, respectively.
// ThreadClock contains a fixed-size vector clock for the maximum number of
// threads. SyncClock contains a growable vector clock for the currently
// necessary number of threads.
// Together they implement a very simple model of operations, namely:
//
// void ThreadClock::acquire(const SyncClock *src) {
//   for (int i = 0; i < kMaxThreads; i++)
//     clock[i] = max(clock[i], src->clock[i]);
// }
//
// void ThreadClock::release(SyncClock *dst) const {
//   for (int i = 0; i < kMaxThreads; i++)
//     dst->clock[i] = max(dst->clock[i], clock[i]);
// }
//
// void ThreadClock::ReleaseStore(SyncClock *dst) const {
//   for (int i = 0; i < kMaxThreads; i++)
//     dst->clock[i] = clock[i];
// }
//
// void ThreadClock::acq_rel(SyncClock *dst) {
//   acquire(dst);
//   release(dst);
// }
//
// Conformance to this model is extensively verified in tsan_clock_test.cpp.
// However, the implementation is significantly more complex. The complexity
// makes it possible to implement important classes of use cases in O(1)
// instead of O(N).
//
// The use cases are:
// 1. Singleton/once atomic that has a single release-store operation followed
//    by zillions of acquire-loads (the acquire-load is O(1)).
// 2. Thread-local mutex (both lock and unlock can be O(1)).
// 3. Leaf mutex (unlock is O(1)).
// 4. A mutex shared by 2 threads (both lock and unlock can be O(1)).
// 5. An atomic with a single writer (writes can be O(1)).
// The implementation dynamically adapts to the workload. So if an atomic is
// in a read-only phase, these reads will be O(1); if it later switches to a
// read/write phase, the implementation will correctly handle that by
// switching to O(N).
//
// Thread-safety note: all const operations on SyncClock's are conducted under
// a shared lock; all non-const operations on SyncClock's are conducted under
// an exclusive lock; ThreadClock's are private to respective threads and so
// do not need any protection.
//
// Description of SyncClock state:
// clk_ - variable size vector clock, low kClkBits hold timestamp,
//   the remaining bits hold the "acquired" flag (the actual value is the
//   thread's reused counter);
//   if acquired == thr->reused_, then the respective thread has already
//   acquired this clock (except possibly for dirty elements).
// dirty_ - holds up to two indices in the vector clock that other threads
//   need to acquire regardless of the "acquired" flag value;
// release_store_tid_ - denotes that the clock state is a result of a
//   release-store operation by the thread with release_store_tid_ index.
// release_store_reused_ - reuse count of release_store_tid_.
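//
// A concrete (illustrative) reading of the element layout described above:
// each clock element packs both fields into a single u64, roughly along the
// lines of the ClockElem declaration in tsan_clock.h:
//
//   struct ClockElem {
//     u64 epoch  : kClkBits;       // the thread's timestamp
//     u64 reused : 64 - kClkBits;  // "acquired" flag: the thread's reuse count
//   };
//
// With this layout the "already acquired" fast path in acquire() reduces to
// checking src->elem(tid_).reused == reused_ plus the two dirty_ entries,
// instead of scanning the whole clock.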

// We don't have ThreadState in these methods, so this is an ugly hack that
// works only in C++.
#if !SANITIZER_GO
# define CPP_STAT_INC(typ) StatInc(cur_thread(), typ)
#else
# define CPP_STAT_INC(typ) (void)0
#endif

namespace __tsan {

static atomic_uint32_t *ref_ptr(ClockBlock *cb) {
  return reinterpret_cast<atomic_uint32_t *>(&cb->table[ClockBlock::kRefIdx]);
}

// Drop reference to the first level block idx.
static void UnrefClockBlock(ClockCache *c, u32 idx, uptr blocks) {
  ClockBlock *cb = ctx->clock_alloc.Map(idx);
  atomic_uint32_t *ref = ref_ptr(cb);
  u32 v = atomic_load(ref, memory_order_acquire);
  for (;;) {
    CHECK_GT(v, 0);
    if (v == 1)
      break;
    if (atomic_compare_exchange_strong(ref, &v, v - 1, memory_order_acq_rel))
      return;
  }
  // First level block owns second level blocks, so free them as well.
  for (uptr i = 0; i < blocks; i++)
    ctx->clock_alloc.Free(c, cb->table[ClockBlock::kBlockIdx - i]);
  ctx->clock_alloc.Free(c, idx);
}

ThreadClock::ThreadClock(unsigned tid, unsigned reused)
    : tid_(tid)
    , reused_(reused + 1)  // 0 has special meaning
    , cached_idx_()
    , cached_size_()
    , cached_blocks_() {
  CHECK_LT(tid, kMaxTidInClock);
  CHECK_EQ(reused_, ((u64)reused_ << kClkBits) >> kClkBits);
  nclk_ = tid_ + 1;
  last_acquire_ = 0;
  internal_memset(clk_, 0, sizeof(clk_));
}

void ThreadClock::ResetCached(ClockCache *c) {
  if (cached_idx_) {
    UnrefClockBlock(c, cached_idx_, cached_blocks_);
    cached_idx_ = 0;
    cached_size_ = 0;
    cached_blocks_ = 0;
  }
}

void ThreadClock::acquire(ClockCache *c, SyncClock *src) {
  DCHECK_LE(nclk_, kMaxTid);
  DCHECK_LE(src->size_, kMaxTid);
  CPP_STAT_INC(StatClockAcquire);

  // Check if it's empty -> no need to do anything.
  const uptr nclk = src->size_;
  if (nclk == 0) {
    CPP_STAT_INC(StatClockAcquireEmpty);
    return;
  }

  bool acquired = false;
  for (unsigned i = 0; i < kDirtyTids; i++) {
    SyncClock::Dirty dirty = src->dirty_[i];
    unsigned tid = dirty.tid;
    if (tid != kInvalidTid) {
      if (clk_[tid] < dirty.epoch) {
        clk_[tid] = dirty.epoch;
        acquired = true;
      }
    }
  }

  // Check if we've already acquired src after the last release operation on src.
  if (tid_ >= nclk || src->elem(tid_).reused != reused_) {
    // O(N) acquire.
    CPP_STAT_INC(StatClockAcquireFull);
    nclk_ = max(nclk_, nclk);
    u64 *dst_pos = &clk_[0];
    for (ClockElem &src_elem : *src) {
      u64 epoch = src_elem.epoch;
      if (*dst_pos < epoch) {
        *dst_pos = epoch;
        acquired = true;
      }
      dst_pos++;
    }

    // Remember that this thread has acquired this clock.
    if (nclk > tid_)
      src->elem(tid_).reused = reused_;
  }

  if (acquired) {
    CPP_STAT_INC(StatClockAcquiredSomething);
    last_acquire_ = clk_[tid_];
    ResetCached(c);
  }
}

void ThreadClock::release(ClockCache *c, SyncClock *dst) {
  DCHECK_LE(nclk_, kMaxTid);
  DCHECK_LE(dst->size_, kMaxTid);

  if (dst->size_ == 0) {
    // ReleaseStore will correctly set release_store_tid_,
    // which can be important for future operations.
    ReleaseStore(c, dst);
    return;
  }

  CPP_STAT_INC(StatClockRelease);
  // Check if we need to resize dst.
  if (dst->size_ < nclk_)
    dst->Resize(c, nclk_);

  // Check if we had not acquired anything from other threads
  // since the last release on dst. If so, we need to update
  // only dst->elem(tid_).
  if (dst->elem(tid_).epoch > last_acquire_) {
    UpdateCurrentThread(c, dst);
    if (dst->release_store_tid_ != tid_ ||
        dst->release_store_reused_ != reused_)
      dst->release_store_tid_ = kInvalidTid;
    return;
  }

  // O(N) release.
  CPP_STAT_INC(StatClockReleaseFull);
  dst->Unshare(c);
  // First, remember whether we've acquired dst.
  bool acquired = IsAlreadyAcquired(dst);
  if (acquired)
    CPP_STAT_INC(StatClockReleaseAcquired);
  // Update dst->clk_.
  dst->FlushDirty();
  uptr i = 0;
  for (ClockElem &ce : *dst) {
    ce.epoch = max(ce.epoch, clk_[i]);
    ce.reused = 0;
    i++;
  }
  // Clear 'acquired' flag in the remaining elements.
  if (nclk_ < dst->size_)
    CPP_STAT_INC(StatClockReleaseClearTail);
  for (uptr i = nclk_; i < dst->size_; i++)
    dst->elem(i).reused = 0;
  dst->release_store_tid_ = kInvalidTid;
  dst->release_store_reused_ = 0;
  // If we've acquired dst, remember this fact,
  // so that we don't need to acquire it on next acquire.
  if (acquired)
    dst->elem(tid_).reused = reused_;
}

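// A rough sketch of the caching scheme used by ReleaseStore() below: after an
// O(N) release-store, the thread may remember the resulting (now shared and
// immutable) first level block in cached_idx_/cached_size_/cached_blocks_ and
// take a reference to it. A later release-store to an empty sync object can
// then simply point that object at the cached block and publish the thread's
// current epoch through dirty_[0], so such stores stay O(1) until the thread
// acquires something new and drops the cache via ResetCached().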
void ThreadClock::ReleaseStore(ClockCache *c, SyncClock *dst) {
  DCHECK_LE(nclk_, kMaxTid);
  DCHECK_LE(dst->size_, kMaxTid);
  CPP_STAT_INC(StatClockStore);

  if (dst->size_ == 0 && cached_idx_ != 0) {
    // Reuse the cached clock.
    // Note: we could reuse/cache the cached clock in more cases:
    // we could update the existing clock and cache it, or replace it with the
    // currently cached clock and release the old one. And for a shared
    // existing clock, we could replace it with the currently cached;
    // or unshare, update and cache. But, for simplicity, we currently reuse
    // the cached clock only when the target clock is empty.
    dst->tab_ = ctx->clock_alloc.Map(cached_idx_);
    dst->tab_idx_ = cached_idx_;
    dst->size_ = cached_size_;
    dst->blocks_ = cached_blocks_;
    CHECK_EQ(dst->dirty_[0].tid, kInvalidTid);
    // The cached clock is shared (immutable),
    // so this is where we store the current clock.
    dst->dirty_[0].tid = tid_;
    dst->dirty_[0].epoch = clk_[tid_];
    dst->release_store_tid_ = tid_;
    dst->release_store_reused_ = reused_;
    // Remember that we don't need to acquire it in the future.
    dst->elem(tid_).reused = reused_;
    // Grab a reference.
    atomic_fetch_add(ref_ptr(dst->tab_), 1, memory_order_relaxed);
    return;
  }

  // Check if we need to resize dst.
  if (dst->size_ < nclk_)
    dst->Resize(c, nclk_);

  if (dst->release_store_tid_ == tid_ &&
      dst->release_store_reused_ == reused_ &&
      dst->elem(tid_).epoch > last_acquire_) {
    CPP_STAT_INC(StatClockStoreFast);
    UpdateCurrentThread(c, dst);
    return;
  }

  // O(N) release-store.
  CPP_STAT_INC(StatClockStoreFull);
  dst->Unshare(c);
  // Note: dst can be larger than this ThreadClock.
  // This is fine since clk_ beyond size is all zeros.
  uptr i = 0;
  for (ClockElem &ce : *dst) {
    ce.epoch = clk_[i];
    ce.reused = 0;
    i++;
  }
  for (uptr i = 0; i < kDirtyTids; i++)
    dst->dirty_[i].tid = kInvalidTid;
  dst->release_store_tid_ = tid_;
  dst->release_store_reused_ = reused_;
  // Remember that we don't need to acquire it in the future.
  dst->elem(tid_).reused = reused_;

  // If the resulting clock is cachable, cache it for future release operations.
  // The clock is always cachable if we released to an empty sync object.
  if (cached_idx_ == 0 && dst->Cachable()) {
    // Grab a reference to the ClockBlock.
    atomic_uint32_t *ref = ref_ptr(dst->tab_);
    if (atomic_load(ref, memory_order_acquire) == 1)
      atomic_store_relaxed(ref, 2);
    else
      atomic_fetch_add(ref_ptr(dst->tab_), 1, memory_order_relaxed);
    cached_idx_ = dst->tab_idx_;
    cached_size_ = dst->size_;
    cached_blocks_ = dst->blocks_;
  }
}

void ThreadClock::acq_rel(ClockCache *c, SyncClock *dst) {
  CPP_STAT_INC(StatClockAcquireRelease);
  acquire(c, dst);
  ReleaseStore(c, dst);
}

// Updates only a single element related to the current thread in dst->clk_.
void ThreadClock::UpdateCurrentThread(ClockCache *c, SyncClock *dst) const {
  // Update the thread's time, but preserve the 'acquired' flag.
  for (unsigned i = 0; i < kDirtyTids; i++) {
    SyncClock::Dirty *dirty = &dst->dirty_[i];
    const unsigned tid = dirty->tid;
    if (tid == tid_ || tid == kInvalidTid) {
      CPP_STAT_INC(StatClockReleaseFast);
      dirty->tid = tid_;
      dirty->epoch = clk_[tid_];
      return;
    }
  }
  // Reset all 'acquired' flags, O(N).
  // We are going to touch dst elements, so we need to unshare it.
  dst->Unshare(c);
  CPP_STAT_INC(StatClockReleaseSlow);
  dst->elem(tid_).epoch = clk_[tid_];
  for (uptr i = 0; i < dst->size_; i++)
    dst->elem(i).reused = 0;
  dst->FlushDirty();
}

// Checks whether the current thread has already acquired src.
bool ThreadClock::IsAlreadyAcquired(const SyncClock *src) const {
  if (src->elem(tid_).reused != reused_)
    return false;
  for (unsigned i = 0; i < kDirtyTids; i++) {
    SyncClock::Dirty dirty = src->dirty_[i];
    if (dirty.tid != kInvalidTid) {
      if (clk_[dirty.tid] < dirty.epoch)
        return false;
    }
  }
  return true;
}

// Sets a single element in the vector clock.
// This function is called only from weird places like AcquireGlobal.
void ThreadClock::set(ClockCache *c, unsigned tid, u64 v) {
  DCHECK_LT(tid, kMaxTid);
  DCHECK_GE(v, clk_[tid]);
  clk_[tid] = v;
  if (nclk_ <= tid)
    nclk_ = tid + 1;
  last_acquire_ = clk_[tid_];
  ResetCached(c);
}

void ThreadClock::DebugDump(int(*printf)(const char *s, ...)) {
  printf("clock=[");
  for (uptr i = 0; i < nclk_; i++)
    printf("%s%llu", i == 0 ? "" : ",", clk_[i]);
  printf("] tid=%u/%u last_acq=%llu", tid_, reused_, last_acquire_);
}

SyncClock::SyncClock() {
  ResetImpl();
}

SyncClock::~SyncClock() {
  // Reset must be called before dtor.
  CHECK_EQ(size_, 0);
  CHECK_EQ(blocks_, 0);
  CHECK_EQ(tab_, 0);
  CHECK_EQ(tab_idx_, 0);
}

void SyncClock::Reset(ClockCache *c) {
  if (size_)
    UnrefClockBlock(c, tab_idx_, blocks_);
  ResetImpl();
}

void SyncClock::ResetImpl() {
  tab_ = 0;
  tab_idx_ = 0;
  size_ = 0;
  blocks_ = 0;
  release_store_tid_ = kInvalidTid;
  release_store_reused_ = 0;
  for (uptr i = 0; i < kDirtyTids; i++)
    dirty_[i].tid = kInvalidTid;
}

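// A rough picture of the two-level structure that Resize() below grows (the
// concrete sizes are illustrative assumptions; the authoritative layout is the
// ClockBlock declaration in tsan_clock.h): the first level block tab_ keeps
// the reference count in table[kRefIdx] and the indices of second level blocks
// in table[kBlockIdx], table[kBlockIdx - 1], ...; clock elements live in the
// second level blocks (kClockCount per block) and in the prefix of the first
// level block not occupied by those table entries, which is what capacity()
// computes. For example, assuming 512-byte blocks, 8-byte clock elements and
// 4-byte table entries (so ratio == 2 and kClockCount == 64), a clock with
// blocks_ == 2 can hold 2 * 64 + (64 - RoundUpTo(2 + 1, 2) / 2) == 190
// elements before another second level block must be appended.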
void SyncClock::Resize(ClockCache *c, uptr nclk) {
  CPP_STAT_INC(StatClockReleaseResize);
  Unshare(c);
  if (nclk <= capacity()) {
    // Memory is already allocated, just increase the size.
    size_ = nclk;
    return;
  }
  if (size_ == 0) {
    // Grow from 0 to one-level table.
    CHECK_EQ(size_, 0);
    CHECK_EQ(blocks_, 0);
    CHECK_EQ(tab_, 0);
    CHECK_EQ(tab_idx_, 0);
    tab_idx_ = ctx->clock_alloc.Alloc(c);
    tab_ = ctx->clock_alloc.Map(tab_idx_);
    internal_memset(tab_, 0, sizeof(*tab_));
    atomic_store_relaxed(ref_ptr(tab_), 1);
    size_ = 1;
  } else if (size_ > blocks_ * ClockBlock::kClockCount) {
    u32 idx = ctx->clock_alloc.Alloc(c);
    ClockBlock *new_cb = ctx->clock_alloc.Map(idx);
    uptr top = size_ - blocks_ * ClockBlock::kClockCount;
    CHECK_LT(top, ClockBlock::kClockCount);
    const uptr move = top * sizeof(tab_->clock[0]);
    internal_memcpy(&new_cb->clock[0], tab_->clock, move);
    internal_memset(&new_cb->clock[top], 0, sizeof(*new_cb) - move);
    internal_memset(tab_->clock, 0, move);
    append_block(idx);
  }
  // At this point we have the first level table allocated and all clock
  // elements are evacuated from it to a second level block.
  // Add second level tables as necessary.
  while (nclk > capacity()) {
    u32 idx = ctx->clock_alloc.Alloc(c);
    ClockBlock *cb = ctx->clock_alloc.Map(idx);
    internal_memset(cb, 0, sizeof(*cb));
    append_block(idx);
  }
  size_ = nclk;
}

// Flushes all dirty elements into the main clock array.
void SyncClock::FlushDirty() {
  for (unsigned i = 0; i < kDirtyTids; i++) {
    Dirty *dirty = &dirty_[i];
    if (dirty->tid != kInvalidTid) {
      CHECK_LT(dirty->tid, size_);
      elem(dirty->tid).epoch = dirty->epoch;
      dirty->tid = kInvalidTid;
    }
  }
}

bool SyncClock::IsShared() const {
  if (size_ == 0)
    return false;
  atomic_uint32_t *ref = ref_ptr(tab_);
  u32 v = atomic_load(ref, memory_order_acquire);
  CHECK_GT(v, 0);
  return v > 1;
}

// Unshares the current clock if it's shared.
// Shared clocks are immutable, so they need to be unshared before any updates.
// Note: this does not apply to dirty entries as they are not shared.
void SyncClock::Unshare(ClockCache *c) {
  if (!IsShared())
    return;
  // First, copy current state into old.
  SyncClock old;
  old.tab_ = tab_;
  old.tab_idx_ = tab_idx_;
  old.size_ = size_;
  old.blocks_ = blocks_;
  old.release_store_tid_ = release_store_tid_;
  old.release_store_reused_ = release_store_reused_;
  for (unsigned i = 0; i < kDirtyTids; i++)
    old.dirty_[i] = dirty_[i];
  // Then, clear current object.
  ResetImpl();
  // Allocate brand new clock in the current object.
  Resize(c, old.size_);
  // Now copy state back into this object.
  Iter old_iter(&old);
  for (ClockElem &ce : *this) {
    ce = *old_iter;
    ++old_iter;
  }
  release_store_tid_ = old.release_store_tid_;
  release_store_reused_ = old.release_store_reused_;
  for (unsigned i = 0; i < kDirtyTids; i++)
    dirty_[i] = old.dirty_[i];
  // Drop reference to old and delete if necessary.
  old.Reset(c);
}

// Can we cache this clock for future release operations?
ALWAYS_INLINE bool SyncClock::Cachable() const {
  if (size_ == 0)
    return false;
  for (unsigned i = 0; i < kDirtyTids; i++) {
    if (dirty_[i].tid != kInvalidTid)
      return false;
  }
  return atomic_load_relaxed(ref_ptr(tab_)) == 1;
}

// elem linearizes the two-level structure into a linear array.
// Note: this is used only for one-time accesses; vector operations use
// the iterator as it is much faster.
ALWAYS_INLINE ClockElem &SyncClock::elem(unsigned tid) const {
  DCHECK_LT(tid, size_);
  const uptr block = tid / ClockBlock::kClockCount;
  DCHECK_LE(block, blocks_);
  tid %= ClockBlock::kClockCount;
  if (block == blocks_)
    return tab_->clock[tid];
  u32 idx = get_block(block);
  ClockBlock *cb = ctx->clock_alloc.Map(idx);
  return cb->clock[tid];
}

ALWAYS_INLINE uptr SyncClock::capacity() const {
  if (size_ == 0)
    return 0;
  uptr ratio = sizeof(ClockBlock::clock[0]) / sizeof(ClockBlock::table[0]);
  // How many clock elements we can fit into the first level block.
  // +1 for ref counter.
  uptr top = ClockBlock::kClockCount - RoundUpTo(blocks_ + 1, ratio) / ratio;
  return blocks_ * ClockBlock::kClockCount + top;
}

ALWAYS_INLINE u32 SyncClock::get_block(uptr bi) const {
  DCHECK(size_);
  DCHECK_LT(bi, blocks_);
  return tab_->table[ClockBlock::kBlockIdx - bi];
}

ALWAYS_INLINE void SyncClock::append_block(u32 idx) {
  uptr bi = blocks_++;
  CHECK_EQ(get_block(bi), 0);
  tab_->table[ClockBlock::kBlockIdx - bi] = idx;
}

// Used only by tests.
u64 SyncClock::get(unsigned tid) const {
  for (unsigned i = 0; i < kDirtyTids; i++) {
    Dirty dirty = dirty_[i];
    if (dirty.tid == tid)
      return dirty.epoch;
  }
  return elem(tid).epoch;
}

// Used only by Iter test.
u64 SyncClock::get_clean(unsigned tid) const {
  return elem(tid).epoch;
}

void SyncClock::DebugDump(int(*printf)(const char *s, ...)) {
  printf("clock=[");
  for (uptr i = 0; i < size_; i++)
    printf("%s%llu", i == 0 ? "" : ",", elem(i).epoch);
  printf("] reused=[");
  for (uptr i = 0; i < size_; i++)
    printf("%s%llu", i == 0 ? "" : ",", elem(i).reused);
  printf("] release_store_tid=%d/%d dirty_tids=%d[%llu]/%d[%llu]",
      release_store_tid_, release_store_reused_,
      dirty_[0].tid, dirty_[0].epoch,
      dirty_[1].tid, dirty_[1].epoch);
}

void SyncClock::Iter::Next() {
  // Finished with the current block, move on to the next one.
  block_++;
  if (block_ < parent_->blocks_) {
    // Iterate over the next second level block.
    u32 idx = parent_->get_block(block_);
    ClockBlock *cb = ctx->clock_alloc.Map(idx);
    pos_ = &cb->clock[0];
    end_ = pos_ + min(parent_->size_ - block_ * ClockBlock::kClockCount,
        ClockBlock::kClockCount);
    return;
  }
  if (block_ == parent_->blocks_ &&
      parent_->size_ > parent_->blocks_ * ClockBlock::kClockCount) {
    // Iterate over elements in the first level block.
    pos_ = &parent_->tab_->clock[0];
    end_ = pos_ + min(parent_->size_ - block_ * ClockBlock::kClockCount,
        ClockBlock::kClockCount);
    return;
  }
  parent_ = nullptr;  // denotes end
}
}  // namespace __tsan