tsan_clock.cpp revision 360784
//===-- tsan_clock.cpp ----------------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file is a part of ThreadSanitizer (TSan), a race detector.
//
//===----------------------------------------------------------------------===//
#include "tsan_clock.h"
#include "tsan_rtl.h"
#include "sanitizer_common/sanitizer_placement_new.h"

// SyncClock and ThreadClock implement vector clocks for sync variables
// (mutexes, atomic variables, file descriptors, etc) and threads, respectively.
// ThreadClock contains a fixed-size vector clock for the maximum number of
// threads.
// SyncClock contains a growable vector clock for the currently necessary
// number of threads.
// Together they implement a very simple model of operations, namely:
//
//   void ThreadClock::acquire(const SyncClock *src) {
//     for (int i = 0; i < kMaxThreads; i++)
//       clock[i] = max(clock[i], src->clock[i]);
//   }
//
//   void ThreadClock::release(SyncClock *dst) const {
//     for (int i = 0; i < kMaxThreads; i++)
//       dst->clock[i] = max(dst->clock[i], clock[i]);
//   }
//
//   void ThreadClock::ReleaseStore(SyncClock *dst) const {
//     for (int i = 0; i < kMaxThreads; i++)
//       dst->clock[i] = clock[i];
//   }
//
//   void ThreadClock::acq_rel(SyncClock *dst) {
//     acquire(dst);
//     release(dst);
//   }
//
// Conformance to this model is extensively verified in tsan_clock_test.cpp.
// However, the implementation is significantly more complex. The complexity
// allows implementing important classes of use cases in O(1) instead of O(N).
//
// The use cases are:
// 1. Singleton/once atomic that has a single release-store operation followed
//    by zillions of acquire-loads (the acquire-load is O(1)).
// 2. Thread-local mutex (both lock and unlock can be O(1)).
// 3. Leaf mutex (unlock is O(1)).
// 4. A mutex shared by 2 threads (both lock and unlock can be O(1)).
// 5. An atomic with a single writer (writes can be O(1)).
// The implementation dynamically adapts to the workload: if an atomic is in a
// read-only phase, those reads will be O(1); if it later switches to a
// read/write phase, the implementation will correctly handle that by
// switching to O(N).
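//
// As a rough usage sketch (simplified; the real call sites live elsewhere in
// the runtime, e.g. in the mutex hooks, and the expressions below are
// illustrative rather than quoted):
//
//   // unlock in thread A publishes A's clock into the mutex's SyncClock:
//   thrA->clock.release(&thrA->proc()->clock_cache, &sync->clock);
//   // a later lock in thread B picks it up:
//   thrB->clock.acquire(&thrB->proc()->clock_cache, &sync->clock);
//
// where thrA/thrB are the per-thread states and sync is the mutex's sync
// object; the ClockCache argument comes from the per-processor state.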
//
// Thread-safety note: all const operations on SyncClock's are conducted under
// a shared lock; all non-const operations on SyncClock's are conducted under
// an exclusive lock; ThreadClock's are private to respective threads and so
// do not need any protection.
//
// Description of SyncClock state:
// clk_ - variable size vector clock, low kClkBits hold the timestamp,
//   the remaining bits hold the "acquired" flag (the actual value is the
//   thread's reused counter);
//   if acquired == thr->reused_, then the respective thread has already
//   acquired this clock (except possibly for dirty elements).
// dirty_ - holds up to two indices in the vector clock that other threads
//   need to acquire regardless of the "acquired" flag value;
// release_store_tid_ - denotes that the clock state is a result of a
//   release-store operation by the thread with the release_store_tid_ index.
// release_store_reused_ - reuse count of release_store_tid_.

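// An informal example of how these pieces combine on the fast path: when
// thread T release-stores into an empty SyncClock and reuses its cached clock
// block, the shared (immutable) table keeps T's previously published epochs
// and dirty_[0] = {T, clk_[T]} records T's current epoch on the side. A later
// acquire merges the dirty entries first and can skip the O(N) merge entirely
// if its "acquired" flag (the reused counter) is still current.
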
// We don't have ThreadState in these methods, so this is an ugly hack that
// works only in C++.
#if !SANITIZER_GO
# define CPP_STAT_INC(typ) StatInc(cur_thread(), typ)
#else
# define CPP_STAT_INC(typ) (void)0
#endif

namespace __tsan {

static atomic_uint32_t *ref_ptr(ClockBlock *cb) {
  return reinterpret_cast<atomic_uint32_t *>(&cb->table[ClockBlock::kRefIdx]);
}

// Drop a reference to the first level block idx.
static void UnrefClockBlock(ClockCache *c, u32 idx, uptr blocks) {
  ClockBlock *cb = ctx->clock_alloc.Map(idx);
  atomic_uint32_t *ref = ref_ptr(cb);
  u32 v = atomic_load(ref, memory_order_acquire);
  for (;;) {
    CHECK_GT(v, 0);
    // If we are the last owner, fall through and free the blocks.
    if (v == 1)
      break;
    // Otherwise just drop our reference.
    if (atomic_compare_exchange_strong(ref, &v, v - 1, memory_order_acq_rel))
      return;
  }
  // First level block owns second level blocks, so free them as well.
  for (uptr i = 0; i < blocks; i++)
    ctx->clock_alloc.Free(c, cb->table[ClockBlock::kBlockIdx - i]);
  ctx->clock_alloc.Free(c, idx);
}

ThreadClock::ThreadClock(unsigned tid, unsigned reused)
    : tid_(tid)
    , reused_(reused + 1)  // 0 has special meaning
    , cached_idx_()
    , cached_size_()
    , cached_blocks_() {
  CHECK_LT(tid, kMaxTidInClock);
  CHECK_EQ(reused_, ((u64)reused_ << kClkBits) >> kClkBits);
  nclk_ = tid_ + 1;
  last_acquire_ = 0;
  internal_memset(clk_, 0, sizeof(clk_));
}

void ThreadClock::ResetCached(ClockCache *c) {
  if (cached_idx_) {
    UnrefClockBlock(c, cached_idx_, cached_blocks_);
    cached_idx_ = 0;
    cached_size_ = 0;
    cached_blocks_ = 0;
  }
}

void ThreadClock::acquire(ClockCache *c, SyncClock *src) {
  DCHECK_LE(nclk_, kMaxTid);
  DCHECK_LE(src->size_, kMaxTid);
  CPP_STAT_INC(StatClockAcquire);

  // Check if it's empty -> no need to do anything.
  const uptr nclk = src->size_;
  if (nclk == 0) {
    CPP_STAT_INC(StatClockAcquireEmpty);
    return;
  }

  bool acquired = false;
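  // Merge the dirty elements first: they must be acquired even if this thread
  // has already acquired the rest of the clock (see the header comment).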
  for (unsigned i = 0; i < kDirtyTids; i++) {
    SyncClock::Dirty dirty = src->dirty_[i];
    unsigned tid = dirty.tid;
    if (tid != kInvalidTid) {
      if (clk_[tid] < dirty.epoch) {
        clk_[tid] = dirty.epoch;
        acquired = true;
      }
    }
  }

  // Check if we've already acquired src after the last release operation on
  // src.
  if (tid_ >= nclk || src->elem(tid_).reused != reused_) {
    // O(N) acquire.
    CPP_STAT_INC(StatClockAcquireFull);
    nclk_ = max(nclk_, nclk);
    u64 *dst_pos = &clk_[0];
    for (ClockElem &src_elem : *src) {
      u64 epoch = src_elem.epoch;
      if (*dst_pos < epoch) {
        *dst_pos = epoch;
        acquired = true;
      }
      dst_pos++;
    }

    // Remember that this thread has acquired this clock.
    if (nclk > tid_)
      src->elem(tid_).reused = reused_;
  }

  if (acquired) {
    CPP_STAT_INC(StatClockAcquiredSomething);
    last_acquire_ = clk_[tid_];
    ResetCached(c);
  }
}

void ThreadClock::release(ClockCache *c, SyncClock *dst) {
  DCHECK_LE(nclk_, kMaxTid);
  DCHECK_LE(dst->size_, kMaxTid);

  if (dst->size_ == 0) {
    // ReleaseStore will correctly set release_store_tid_,
    // which can be important for future operations.
    ReleaseStore(c, dst);
    return;
  }

  CPP_STAT_INC(StatClockRelease);
  // Check if we need to resize dst.
  if (dst->size_ < nclk_)
    dst->Resize(c, nclk_);

  // Check if we have not acquired anything from other threads
  // since the last release on dst. If so, we need to update
  // only dst->elem(tid_).
  if (dst->elem(tid_).epoch > last_acquire_) {
    UpdateCurrentThread(c, dst);
    if (dst->release_store_tid_ != tid_ ||
        dst->release_store_reused_ != reused_)
      dst->release_store_tid_ = kInvalidTid;
    return;
  }

  // O(N) release.
  CPP_STAT_INC(StatClockReleaseFull);
  dst->Unshare(c);
  // First, remember whether we've acquired dst.
  bool acquired = IsAlreadyAcquired(dst);
  if (acquired)
    CPP_STAT_INC(StatClockReleaseAcquired);
  // Update dst->clk_.
  dst->FlushDirty();
  uptr i = 0;
  for (ClockElem &ce : *dst) {
    ce.epoch = max(ce.epoch, clk_[i]);
    ce.reused = 0;
    i++;
  }
  // Clear the 'acquired' flag in the remaining elements.
  if (nclk_ < dst->size_)
    CPP_STAT_INC(StatClockReleaseClearTail);
  for (uptr i = nclk_; i < dst->size_; i++)
    dst->elem(i).reused = 0;
  dst->release_store_tid_ = kInvalidTid;
  dst->release_store_reused_ = 0;
  // If we've acquired dst, remember this fact,
  // so that we don't need to acquire it on the next acquire.
  if (acquired)
    dst->elem(tid_).reused = reused_;
}

void ThreadClock::ReleaseStore(ClockCache *c, SyncClock *dst) {
  DCHECK_LE(nclk_, kMaxTid);
  DCHECK_LE(dst->size_, kMaxTid);
  CPP_STAT_INC(StatClockStore);

  if (dst->size_ == 0 && cached_idx_ != 0) {
    // Reuse the cached clock.
    // Note: we could reuse/cache the cached clock in more cases:
    // we could update the existing clock and cache it, or replace it with the
    // currently cached clock and release the old one. And for a shared
    // existing clock, we could replace it with the currently cached;
    // or unshare, update and cache. But, for simplicity, we currently reuse
    // the cached clock only when the target clock is empty.
    dst->tab_ = ctx->clock_alloc.Map(cached_idx_);
    dst->tab_idx_ = cached_idx_;
    dst->size_ = cached_size_;
    dst->blocks_ = cached_blocks_;
    CHECK_EQ(dst->dirty_[0].tid, kInvalidTid);
    // The cached clock is shared (immutable),
    // so this is where we store the current clock.
    dst->dirty_[0].tid = tid_;
    dst->dirty_[0].epoch = clk_[tid_];
    dst->release_store_tid_ = tid_;
    dst->release_store_reused_ = reused_;
    // Remember that we don't need to acquire it in the future.
    dst->elem(tid_).reused = reused_;
    // Grab a reference.
    atomic_fetch_add(ref_ptr(dst->tab_), 1, memory_order_relaxed);
    return;
  }

  // Check if we need to resize dst.
  if (dst->size_ < nclk_)
    dst->Resize(c, nclk_);

  if (dst->release_store_tid_ == tid_ &&
      dst->release_store_reused_ == reused_ &&
      dst->elem(tid_).epoch > last_acquire_) {
    CPP_STAT_INC(StatClockStoreFast);
    UpdateCurrentThread(c, dst);
    return;
  }

  // O(N) release-store.
  CPP_STAT_INC(StatClockStoreFull);
  dst->Unshare(c);
  // Note: dst can be larger than this ThreadClock.
  // This is fine since clk_ beyond size is all zeros.
  uptr i = 0;
  for (ClockElem &ce : *dst) {
    ce.epoch = clk_[i];
    ce.reused = 0;
    i++;
  }
  for (uptr i = 0; i < kDirtyTids; i++)
    dst->dirty_[i].tid = kInvalidTid;
  dst->release_store_tid_ = tid_;
  dst->release_store_reused_ = reused_;
  // Remember that we don't need to acquire it in the future.
  dst->elem(tid_).reused = reused_;

  // If the resulting clock is cachable, cache it for future release operations.
  // The clock is always cachable if we released to an empty sync object.
  if (cached_idx_ == 0 && dst->Cachable()) {
    // Grab a reference to the ClockBlock
    // (a plain store suffices when we are the sole owner).
    atomic_uint32_t *ref = ref_ptr(dst->tab_);
    if (atomic_load(ref, memory_order_acquire) == 1)
      atomic_store_relaxed(ref, 2);
    else
      atomic_fetch_add(ref_ptr(dst->tab_), 1, memory_order_relaxed);
    cached_idx_ = dst->tab_idx_;
    cached_size_ = dst->size_;
    cached_blocks_ = dst->blocks_;
  }
}

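// Note: the ReleaseStore below (rather than release) is sufficient because
// after the preceding acquire this thread's clock is pointwise >= dst, so
// storing it is equivalent to the max-merge in the model above.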
void ThreadClock::acq_rel(ClockCache *c, SyncClock *dst) {
  CPP_STAT_INC(StatClockAcquireRelease);
  acquire(c, dst);
  ReleaseStore(c, dst);
}

// Updates only a single element related to the current thread in dst->clk_.
void ThreadClock::UpdateCurrentThread(ClockCache *c, SyncClock *dst) const {
  // Update the thread's time, but preserve the 'acquired' flag.
  for (unsigned i = 0; i < kDirtyTids; i++) {
    SyncClock::Dirty *dirty = &dst->dirty_[i];
    const unsigned tid = dirty->tid;
    if (tid == tid_ || tid == kInvalidTid) {
      CPP_STAT_INC(StatClockReleaseFast);
      dirty->tid = tid_;
      dirty->epoch = clk_[tid_];
      return;
    }
  }
  // Reset all 'acquired' flags, O(N).
  // We are going to touch dst elements, so we need to unshare it.
  dst->Unshare(c);
  CPP_STAT_INC(StatClockReleaseSlow);
  dst->elem(tid_).epoch = clk_[tid_];
  for (uptr i = 0; i < dst->size_; i++)
    dst->elem(i).reused = 0;
  dst->FlushDirty();
}

// Checks whether the current thread has already acquired src.
bool ThreadClock::IsAlreadyAcquired(const SyncClock *src) const {
  if (src->elem(tid_).reused != reused_)
    return false;
  for (unsigned i = 0; i < kDirtyTids; i++) {
    SyncClock::Dirty dirty = src->dirty_[i];
    if (dirty.tid != kInvalidTid) {
      if (clk_[dirty.tid] < dirty.epoch)
        return false;
    }
  }
  return true;
}

// Sets a single element in the vector clock.
// This function is called only from weird places like AcquireGlobal.
void ThreadClock::set(ClockCache *c, unsigned tid, u64 v) {
  DCHECK_LT(tid, kMaxTid);
  DCHECK_GE(v, clk_[tid]);
  clk_[tid] = v;
  if (nclk_ <= tid)
    nclk_ = tid + 1;
  last_acquire_ = clk_[tid_];
  ResetCached(c);
}

void ThreadClock::DebugDump(int(*printf)(const char *s, ...)) {
  printf("clock=[");
  for (uptr i = 0; i < nclk_; i++)
    printf("%s%llu", i == 0 ? "" : ",", clk_[i]);
  printf("] tid=%u/%u last_acq=%llu", tid_, reused_, last_acquire_);
}

SyncClock::SyncClock() {
  ResetImpl();
}

SyncClock::~SyncClock() {
  // Reset must be called before dtor.
  CHECK_EQ(size_, 0);
  CHECK_EQ(blocks_, 0);
  CHECK_EQ(tab_, 0);
  CHECK_EQ(tab_idx_, 0);
}

void SyncClock::Reset(ClockCache *c) {
  if (size_)
    UnrefClockBlock(c, tab_idx_, blocks_);
  ResetImpl();
}

void SyncClock::ResetImpl() {
  tab_ = 0;
  tab_idx_ = 0;
  size_ = 0;
  blocks_ = 0;
  release_store_tid_ = kInvalidTid;
  release_store_reused_ = 0;
  for (uptr i = 0; i < kDirtyTids; i++)
    dirty_[i].tid = kInvalidTid;
}

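// Storage layout note (derived from ref_ptr/get_block/append_block in this
// file): the first level block stores clock elements from index 0 upwards,
// while the reference counter (kRefIdx) and the indices of the second level
// blocks (kBlockIdx - i) occupy the tail of the same block, so growing the
// number of second level blocks gradually displaces clock elements from the
// first level table.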
void SyncClock::Resize(ClockCache *c, uptr nclk) {
  CPP_STAT_INC(StatClockReleaseResize);
  Unshare(c);
  if (nclk <= capacity()) {
    // Memory is already allocated, just increase the size.
    size_ = nclk;
    return;
  }
  if (size_ == 0) {
    // Grow from 0 to a one-level table.
    CHECK_EQ(size_, 0);
    CHECK_EQ(blocks_, 0);
    CHECK_EQ(tab_, 0);
    CHECK_EQ(tab_idx_, 0);
    tab_idx_ = ctx->clock_alloc.Alloc(c);
    tab_ = ctx->clock_alloc.Map(tab_idx_);
    internal_memset(tab_, 0, sizeof(*tab_));
    atomic_store_relaxed(ref_ptr(tab_), 1);
    size_ = 1;
  } else if (size_ > blocks_ * ClockBlock::kClockCount) {
    // The first level block still holds clock elements;
    // move them out into a freshly allocated second level block.
    u32 idx = ctx->clock_alloc.Alloc(c);
    ClockBlock *new_cb = ctx->clock_alloc.Map(idx);
    uptr top = size_ - blocks_ * ClockBlock::kClockCount;
    CHECK_LT(top, ClockBlock::kClockCount);
    const uptr move = top * sizeof(tab_->clock[0]);
    internal_memcpy(&new_cb->clock[0], tab_->clock, move);
    internal_memset(&new_cb->clock[top], 0, sizeof(*new_cb) - move);
    internal_memset(tab_->clock, 0, move);
    append_block(idx);
  }
  // At this point the first level table is allocated and all clock elements
  // are evacuated from it to a second level block.
  // Add second level tables as necessary.
  while (nclk > capacity()) {
    u32 idx = ctx->clock_alloc.Alloc(c);
    ClockBlock *cb = ctx->clock_alloc.Map(idx);
    internal_memset(cb, 0, sizeof(*cb));
    append_block(idx);
  }
  size_ = nclk;
}

// Flushes all dirty elements into the main clock array.
void SyncClock::FlushDirty() {
  for (unsigned i = 0; i < kDirtyTids; i++) {
    Dirty *dirty = &dirty_[i];
    if (dirty->tid != kInvalidTid) {
      CHECK_LT(dirty->tid, size_);
      elem(dirty->tid).epoch = dirty->epoch;
      dirty->tid = kInvalidTid;
    }
  }
}

bool SyncClock::IsShared() const {
  if (size_ == 0)
    return false;
  atomic_uint32_t *ref = ref_ptr(tab_);
  u32 v = atomic_load(ref, memory_order_acquire);
  CHECK_GT(v, 0);
  return v > 1;
}

// Unshares the current clock if it's shared.
// Shared clocks are immutable, so they need to be unshared before any updates.
// Note: this does not apply to dirty entries as they are not shared.
void SyncClock::Unshare(ClockCache *c) {
  if (!IsShared())
    return;
  // First, copy current state into old.
  SyncClock old;
  old.tab_ = tab_;
  old.tab_idx_ = tab_idx_;
  old.size_ = size_;
  old.blocks_ = blocks_;
  old.release_store_tid_ = release_store_tid_;
  old.release_store_reused_ = release_store_reused_;
  for (unsigned i = 0; i < kDirtyTids; i++)
    old.dirty_[i] = dirty_[i];
  // Then, clear current object.
  ResetImpl();
  // Allocate brand new clock in the current object.
  Resize(c, old.size_);
  // Now copy state back into this object.
  Iter old_iter(&old);
  for (ClockElem &ce : *this) {
    ce = *old_iter;
    ++old_iter;
  }
  release_store_tid_ = old.release_store_tid_;
  release_store_reused_ = old.release_store_reused_;
  for (unsigned i = 0; i < kDirtyTids; i++)
    dirty_[i] = old.dirty_[i];
  // Drop reference to old and delete if necessary.
  old.Reset(c);
}

// Can we cache this clock for future release operations?
ALWAYS_INLINE bool SyncClock::Cachable() const {
  if (size_ == 0)
    return false;
  for (unsigned i = 0; i < kDirtyTids; i++) {
    if (dirty_[i].tid != kInvalidTid)
      return false;
  }
  return atomic_load_relaxed(ref_ptr(tab_)) == 1;
}

// elem linearizes the two-level structure into a linear array.
// Note: this is used only for one-time accesses; vector operations use
// the iterator as it is much faster.
ALWAYS_INLINE ClockElem &SyncClock::elem(unsigned tid) const {
  DCHECK_LT(tid, size_);
  const uptr block = tid / ClockBlock::kClockCount;
  DCHECK_LE(block, blocks_);
  tid %= ClockBlock::kClockCount;
  if (block == blocks_)
    return tab_->clock[tid];
  u32 idx = get_block(block);
  ClockBlock *cb = ctx->clock_alloc.Map(idx);
  return cb->clock[tid];
}

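// Worked example for capacity() below: the reference counter and the blocks_
// second level block indices occupy u32 slots at the tail of the first level
// table, and every `ratio` such slots displace one clock element; the
// RoundUpTo(blocks_ + 1, ratio) / ratio term accounts for exactly that.
// E.g. with no second level blocks the first level block holds
// ClockBlock::kClockCount - 1 elements (one element's worth of space is
// reserved for the reference counter).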
ALWAYS_INLINE uptr SyncClock::capacity() const {
  if (size_ == 0)
    return 0;
  uptr ratio = sizeof(ClockBlock::clock[0]) / sizeof(ClockBlock::table[0]);
  // How many clock elements we can fit into the first level block.
  // +1 for ref counter.
  uptr top = ClockBlock::kClockCount - RoundUpTo(blocks_ + 1, ratio) / ratio;
  return blocks_ * ClockBlock::kClockCount + top;
}

ALWAYS_INLINE u32 SyncClock::get_block(uptr bi) const {
  DCHECK(size_);
  DCHECK_LT(bi, blocks_);
  return tab_->table[ClockBlock::kBlockIdx - bi];
}

ALWAYS_INLINE void SyncClock::append_block(u32 idx) {
  uptr bi = blocks_++;
  CHECK_EQ(get_block(bi), 0);
  tab_->table[ClockBlock::kBlockIdx - bi] = idx;
}

// Used only by tests.
u64 SyncClock::get(unsigned tid) const {
  for (unsigned i = 0; i < kDirtyTids; i++) {
    Dirty dirty = dirty_[i];
    if (dirty.tid == tid)
      return dirty.epoch;
  }
  return elem(tid).epoch;
}

// Used only by Iter test.
u64 SyncClock::get_clean(unsigned tid) const {
  return elem(tid).epoch;
}

void SyncClock::DebugDump(int(*printf)(const char *s, ...)) {
  printf("clock=[");
  for (uptr i = 0; i < size_; i++)
    printf("%s%llu", i == 0 ? "" : ",", elem(i).epoch);
  printf("] reused=[");
  for (uptr i = 0; i < size_; i++)
    printf("%s%llu", i == 0 ? "" : ",", elem(i).reused);
  printf("] release_store_tid=%d/%d dirty_tids=%d[%llu]/%d[%llu]",
      release_store_tid_, release_store_reused_,
      dirty_[0].tid, dirty_[0].epoch,
      dirty_[1].tid, dirty_[1].epoch);
}

void SyncClock::Iter::Next() {
  // Finished with the current block, move on to the next one.
  block_++;
  if (block_ < parent_->blocks_) {
    // Iterate over the next second level block.
    u32 idx = parent_->get_block(block_);
    ClockBlock *cb = ctx->clock_alloc.Map(idx);
    pos_ = &cb->clock[0];
    end_ = pos_ + min(parent_->size_ - block_ * ClockBlock::kClockCount,
        ClockBlock::kClockCount);
    return;
  }
  if (block_ == parent_->blocks_ &&
      parent_->size_ > parent_->blocks_ * ClockBlock::kClockCount) {
    // Iterate over elements in the first level block.
    pos_ = &parent_->tab_->clock[0];
    end_ = pos_ + min(parent_->size_ - block_ * ClockBlock::kClockCount,
        ClockBlock::kClockCount);
    return;
  }
  parent_ = nullptr;  // denotes end
}
}  // namespace __tsan