LSUnit.h revision 360784
//===------------------------- LSUnit.h --------------------------*- C++-*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
///
/// A Load/Store unit class that models load/store queues and that implements
/// a simple weak memory consistency model.
///
//===----------------------------------------------------------------------===//

#ifndef LLVM_MCA_LSUNIT_H
#define LLVM_MCA_LSUNIT_H

#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/MC/MCSchedule.h"
#include "llvm/MCA/HardwareUnits/HardwareUnit.h"
#include "llvm/MCA/Instruction.h"

namespace llvm {
namespace mca {

class Scheduler;

/// A node of a memory dependency graph. A MemoryGroup describes a set of
/// instructions with the same memory dependencies.
///
/// By construction, instructions of a MemoryGroup don't depend on each other.
/// At dispatch stage, instructions are mapped by the LSUnit to MemoryGroups.
/// A MemoryGroup identifier is then stored as a "token" in field
/// Instruction::LSUTokenID of each dispatched instruction. That token is used
/// internally by the LSUnit to track memory dependencies.
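///
/// The sketch below is illustrative only: groups are normally created and
/// owned by the LSUnit (see LSUnitBase::createMemoryGroup()), the group names
/// are hypothetical, and \c IR stands for the InstRef of the store being
/// issued. It shows how dependency edges and state-change callbacks interact:
///
/// \code
///   MemoryGroup StoreGroup;
///   MemoryGroup LoadGroup;
///   StoreGroup.addInstruction();          // One store tracked by StoreGroup.
///   StoreGroup.addSuccessor(&LoadGroup);  // LoadGroup depends on StoreGroup.
///
///   // No predecessor of LoadGroup has issued yet.
///   assert(LoadGroup.isWaiting());
///
///   StoreGroup.onInstructionIssued(IR);   // The store starts executing.
///   assert(LoadGroup.isPending());
///
///   StoreGroup.onInstructionExecuted();   // The store completes.
///   assert(LoadGroup.isReady());
/// \endcode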
class MemoryGroup {
  unsigned NumPredecessors;
  unsigned NumExecutingPredecessors;
  unsigned NumExecutedPredecessors;

  unsigned NumInstructions;
  unsigned NumExecuting;
  unsigned NumExecuted;
  SmallVector<MemoryGroup *, 4> Succ;

  CriticalDependency CriticalPredecessor;
  InstRef CriticalMemoryInstruction;

  MemoryGroup(const MemoryGroup &) = delete;
  MemoryGroup &operator=(const MemoryGroup &) = delete;

public:
  MemoryGroup()
      : NumPredecessors(0), NumExecutingPredecessors(0),
        NumExecutedPredecessors(0), NumInstructions(0), NumExecuting(0),
        NumExecuted(0), CriticalPredecessor(), CriticalMemoryInstruction() {}
  MemoryGroup(MemoryGroup &&) = default;

  ArrayRef<MemoryGroup *> getSuccessors() const { return Succ; }
  unsigned getNumSuccessors() const { return Succ.size(); }
  unsigned getNumPredecessors() const { return NumPredecessors; }
  unsigned getNumExecutingPredecessors() const {
    return NumExecutingPredecessors;
  }
  unsigned getNumExecutedPredecessors() const {
    return NumExecutedPredecessors;
  }
  unsigned getNumInstructions() const { return NumInstructions; }
  unsigned getNumExecuting() const { return NumExecuting; }
  unsigned getNumExecuted() const { return NumExecuted; }

  const InstRef &getCriticalMemoryInstruction() const {
    return CriticalMemoryInstruction;
  }
  const CriticalDependency &getCriticalPredecessor() const {
    return CriticalPredecessor;
  }

  void addSuccessor(MemoryGroup *Group) {
    Group->NumPredecessors++;
    assert(!isExecuted() && "Should have been removed!");
    if (isExecuting())
      Group->onGroupIssued(CriticalMemoryInstruction);
    Succ.emplace_back(Group);
  }

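  // A group moves through the following states, exposed by the predicates
  // below:
  //  - waiting:   some predecessor groups have not issued yet.
  //  - pending:   every predecessor has issued, and at least one of them is
  //               still executing.
  //  - ready:     all predecessor groups have fully executed.
  //  - executing: all not-yet-executed instructions of this group are in
  //               flight.
  //  - executed:  every instruction of this group has completed.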
  bool isWaiting() const {
    return NumPredecessors >
           (NumExecutingPredecessors + NumExecutedPredecessors);
  }
  bool isPending() const {
    return NumExecutingPredecessors &&
           ((NumExecutedPredecessors + NumExecutingPredecessors) ==
            NumPredecessors);
  }
  bool isReady() const { return NumExecutedPredecessors == NumPredecessors; }
  bool isExecuting() const {
    return NumExecuting && (NumExecuting == (NumInstructions - NumExecuted));
  }
  bool isExecuted() const { return NumInstructions == NumExecuted; }

  void onGroupIssued(const InstRef &IR) {
    assert(!isReady() && "Unexpected group-start event!");
    NumExecutingPredecessors++;

    unsigned Cycles = IR.getInstruction()->getCyclesLeft();
    if (CriticalPredecessor.Cycles < Cycles) {
      CriticalPredecessor.IID = IR.getSourceIndex();
      CriticalPredecessor.Cycles = Cycles;
    }
  }

  void onGroupExecuted() {
    assert(!isReady() && "Inconsistent state found!");
    NumExecutingPredecessors--;
    NumExecutedPredecessors++;
  }

  void onInstructionIssued(const InstRef &IR) {
    assert(!isExecuting() && "Invalid internal state!");
    ++NumExecuting;

    // Update the critical memory instruction of this group.
    const Instruction &IS = *IR.getInstruction();
    if ((bool)CriticalMemoryInstruction) {
      const Instruction &OtherIS = *CriticalMemoryInstruction.getInstruction();
      if (OtherIS.getCyclesLeft() < IS.getCyclesLeft())
        CriticalMemoryInstruction = IR;
    } else {
      CriticalMemoryInstruction = IR;
    }

    if (!isExecuting())
      return;

    // Notify successors that this group started execution.
    for (MemoryGroup *MG : Succ)
      MG->onGroupIssued(CriticalMemoryInstruction);
  }

  void onInstructionExecuted() {
    assert(isReady() && !isExecuted() && "Invalid internal state!");
    --NumExecuting;
    ++NumExecuted;

    if (!isExecuted())
      return;

    // Notify successors that this group has finished execution.
    for (MemoryGroup *MG : Succ)
      MG->onGroupExecuted();
  }

  void addInstruction() {
    assert(!getNumSuccessors() && "Cannot add instructions to this group!");
    ++NumInstructions;
  }

  void cycleEvent() {
    if (isWaiting() && CriticalPredecessor.Cycles)
      CriticalPredecessor.Cycles--;
  }
};

/// Abstract base interface for LS (load/store) units in llvm-mca.
class LSUnitBase : public HardwareUnit {
  /// Load queue size.
  ///
  /// A value of zero for this field means that the load queue is unbounded.
  /// Processor models can declare the size of a load queue via tablegen (see
  /// the definition of tablegen class LoadQueue in
  /// llvm/Target/TargetSchedule.td).
  unsigned LQSize;

  /// Store queue size.
  ///
  /// A value of zero for this field means that the store queue is unbounded.
  /// Processor models can declare the size of a store queue via tablegen (see
  /// the definition of tablegen class StoreQueue in
  /// llvm/Target/TargetSchedule.td).
  unsigned SQSize;

  unsigned UsedLQEntries;
  unsigned UsedSQEntries;

  /// True if loads don't alias with stores.
  ///
  /// By default, the LS unit assumes that loads and stores don't alias with
  /// each other. If this field is set to false, then loads are always assumed
  /// to alias with stores.
  const bool NoAlias;

  /// Used to map group identifiers to MemoryGroups.
  DenseMap<unsigned, std::unique_ptr<MemoryGroup>> Groups;
  unsigned NextGroupID;

public:
  LSUnitBase(const MCSchedModel &SM, unsigned LoadQueueSize,
             unsigned StoreQueueSize, bool AssumeNoAlias);

  virtual ~LSUnitBase();

  /// Returns the total number of entries in the load queue.
  unsigned getLoadQueueSize() const { return LQSize; }

  /// Returns the total number of entries in the store queue.
  unsigned getStoreQueueSize() const { return SQSize; }

  unsigned getUsedLQEntries() const { return UsedLQEntries; }
  unsigned getUsedSQEntries() const { return UsedSQEntries; }
  void acquireLQSlot() { ++UsedLQEntries; }
  void acquireSQSlot() { ++UsedSQEntries; }
  void releaseLQSlot() { --UsedLQEntries; }
  void releaseSQSlot() { --UsedSQEntries; }

  bool assumeNoAlias() const { return NoAlias; }

  enum Status {
    LSU_AVAILABLE = 0,
    LSU_LQUEUE_FULL, // Load Queue unavailable
    LSU_SQUEUE_FULL  // Store Queue unavailable
  };

  /// This method checks the availability of the load/store buffers.
  ///
  /// Returns LSU_AVAILABLE if there are enough load/store queue entries to
  /// accommodate instruction IR. By default, LSU_AVAILABLE is returned if IR
  /// is not a memory operation.
  virtual Status isAvailable(const InstRef &IR) const = 0;

  /// Allocates LS resources for instruction IR.
  ///
  /// This method assumes that a previous call to `isAvailable(IR)` succeeded
  /// with an LSUnitBase::Status value of LSU_AVAILABLE.
  /// Returns the GroupID associated with this instruction. That value will be
  /// used to set the LSUTokenID field in class Instruction.
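  ///
  /// The snippet below is an illustrative sketch of that contract (the
  /// variable names are hypothetical, and the call to setLSUTokenID() assumes
  /// the usual setter exposed by class Instruction):
  ///
  /// \code
  ///   if (LSU.isAvailable(IR) == LSUnitBase::LSU_AVAILABLE) {
  ///     unsigned TokenID = LSU.dispatch(IR);
  ///     IR.getInstruction()->setLSUTokenID(TokenID);
  ///   }
  /// \endcode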
  virtual unsigned dispatch(const InstRef &IR) = 0;

  bool isSQEmpty() const { return !UsedSQEntries; }
  bool isLQEmpty() const { return !UsedLQEntries; }
  bool isSQFull() const { return SQSize && SQSize == UsedSQEntries; }
  bool isLQFull() const { return LQSize && LQSize == UsedLQEntries; }

  bool isValidGroupID(unsigned Index) const {
    return Index && (Groups.find(Index) != Groups.end());
  }

  /// Check if a previously dispatched instruction IR is now ready for
  /// execution.
  bool isReady(const InstRef &IR) const {
    unsigned GroupID = IR.getInstruction()->getLSUTokenID();
    const MemoryGroup &Group = getGroup(GroupID);
    return Group.isReady();
  }

  /// Check if instruction IR only depends on memory instructions that are
  /// currently executing.
  bool isPending(const InstRef &IR) const {
    unsigned GroupID = IR.getInstruction()->getLSUTokenID();
    const MemoryGroup &Group = getGroup(GroupID);
    return Group.isPending();
  }

  /// Check if instruction IR is still waiting on memory operations, and the
  /// wait time is still unknown.
  bool isWaiting(const InstRef &IR) const {
    unsigned GroupID = IR.getInstruction()->getLSUTokenID();
    const MemoryGroup &Group = getGroup(GroupID);
    return Group.isWaiting();
  }

  bool hasDependentUsers(const InstRef &IR) const {
    unsigned GroupID = IR.getInstruction()->getLSUTokenID();
    const MemoryGroup &Group = getGroup(GroupID);
    return !Group.isExecuted() && Group.getNumSuccessors();
  }

  const MemoryGroup &getGroup(unsigned Index) const {
    assert(isValidGroupID(Index) && "Group doesn't exist!");
    return *Groups.find(Index)->second;
  }

  MemoryGroup &getGroup(unsigned Index) {
    assert(isValidGroupID(Index) && "Group doesn't exist!");
    return *Groups.find(Index)->second;
  }

  unsigned createMemoryGroup() {
    Groups.insert(
        std::make_pair(NextGroupID, std::make_unique<MemoryGroup>()));
    return NextGroupID++;
  }

  virtual void onInstructionExecuted(const InstRef &IR);

  // Loads are tracked by the LDQ (load queue) from dispatch until completion.
  // Stores are tracked by the STQ (store queue) from dispatch until
  // commitment. By default, we conservatively assume that the LDQ receives a
  // load at dispatch. Loads leave the LDQ at the retirement stage.
  virtual void onInstructionRetired(const InstRef &IR);

  virtual void onInstructionIssued(const InstRef &IR) {
    unsigned GroupID = IR.getInstruction()->getLSUTokenID();
    Groups[GroupID]->onInstructionIssued(IR);
  }

  virtual void cycleEvent();

#ifndef NDEBUG
  void dump() const;
#endif
};

/// Default Load/Store Unit (LS Unit) for simulated processors.
///
/// Each load (or store) consumes one entry in the load (or store) queue.
///
/// Rules are:
/// 1) A younger load is allowed to pass an older load only if there are no
///    stores nor barriers in between the two loads.
/// 2) A younger store is not allowed to pass an older store.
/// 3) A younger store is not allowed to pass an older load.
/// 4) A younger load is allowed to pass an older store only if the load does
///    not alias with the store.
///
/// This class optimistically assumes that loads don't alias with store
/// operations. Under this assumption, younger loads are always allowed to
/// pass older stores (this only affects rule 4).
/// Essentially, this class doesn't perform any sort of alias analysis to
/// identify aliasing loads and stores.
///
/// To enforce aliasing between loads and stores, flag `AssumeNoAlias` must be
/// set to `false` by the constructor of LSUnit.
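///
/// As an illustrative sketch (the variable names here are hypothetical), a
/// client that wants loads to always alias with stores would construct the
/// unit with the last constructor argument set to false:
///
/// \code
///   LSUnit LSU(SM, /* LQ */ 0, /* SQ */ 0, /* AssumeNoAlias */ false);
/// \endcode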
///
/// Note that this class doesn't know about the existence of different memory
/// types for memory operations (example: write-through, write-combining,
/// etc.). Derived classes are responsible for implementing that extra
/// knowledge, and provide different sets of rules for loads and stores by
/// overriding method `isReady()`.
/// To emulate a write-combining memory type, rule 2 must be relaxed in a
/// derived class to enable the reordering of non-aliasing store operations.
///
/// No assumptions are made by this class on the size of the store buffer. This
/// class doesn't know how to identify cases where store-to-load forwarding may
/// occur.
///
/// LSUnit doesn't attempt to predict whether a load or store hits or misses
/// the L1 cache. To be more specific, LSUnit doesn't know anything about
/// cache hierarchy and memory types.
/// It only knows if an instruction "mayLoad" and/or "mayStore". For loads, the
/// scheduling model provides an "optimistic" load-to-use latency (which
/// usually matches the load-to-use latency for when there is a hit in the
/// L1D). Derived classes may expand this knowledge.
///
/// Class MCInstrDesc in LLVM doesn't know about serializing operations, nor
/// memory-barrier-like instructions.
/// LSUnit conservatively assumes that an instruction which `mayLoad` and has
/// `unmodeled side effects` behaves like a "soft" load-barrier. That means it
/// serializes loads without forcing a flush of the load queue.
/// Similarly, an instruction that `mayStore` and has `unmodeled side effects`
/// is treated like a store barrier. A full memory barrier is a 'mayLoad' and
/// 'mayStore' instruction with unmodeled side effects. This is obviously
/// inaccurate, but this is the best that we can do at the moment.
///
/// Each load/store barrier consumes one entry in the load/store queue. A
/// load/store barrier enforces ordering of loads/stores:
///  - A younger load cannot pass a load barrier.
///  - A younger store cannot pass a store barrier.
///
/// A younger load has to wait for the memory load barrier to execute.
/// A load/store barrier is "executed" when it becomes the oldest entry in
/// the load/store queue(s). That also means that all the older loads/stores
/// have already been executed.
class LSUnit : public LSUnitBase {
  // This class doesn't know about the latency of a load instruction. So, it
  // conservatively/pessimistically assumes that the latency of a load opcode
  // matches the instruction latency.
  //
  // FIXME: In the absence of cache misses (i.e. L1I/L1D/iTLB/dTLB hits/misses),
  // and load/store conflicts, the latency of a load is determined by the depth
  // of the load pipeline. So, we could use field `LoadLatency` in the
  // MCSchedModel to model that latency.
  // Field `LoadLatency` often matches the so-called 'load-to-use' latency from
  // L1D, and it usually already accounts for any extra latency due to data
  // forwarding.
  // When doing throughput analysis, `LoadLatency` is likely to
  // be a better predictor of load latency than instruction latency. This is
  // particularly true when simulating code with temporal/spatial locality of
  // memory accesses.
  // Using `LoadLatency` (instead of the instruction latency) is also expected
  // to improve the load queue allocation for long latency instructions with
  // folded memory operands (See PR39829).
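  //
  // As a rough sketch of that idea (illustrative only; `SM` stands for the
  // MCSchedModel passed to the constructor), a derived unit could read
  //   unsigned OptimisticLoadLatency = SM.LoadLatency;
  // and use that value instead of the instruction latency for loads.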
  //
  // FIXME: On some processors, load/store operations are split into multiple
  // uOps. For example, X86 AMD Jaguar natively supports 128-bit data types,
  // but not 256-bit data types. So, a 256-bit load is effectively split into
  // two 128-bit loads, and each split load consumes one 'LoadQueue' entry.
  // For simplicity, this class optimistically assumes that a load instruction
  // only consumes one entry in the LoadQueue. Similarly, store instructions
  // only consume a single entry in the StoreQueue.
  // In the future, we should reassess the quality of this design, and consider
  // alternative approaches that let instructions specify the number of
  // load/store queue entries which they consume at dispatch stage (See
  // PR39830).
  //
  // An instruction that is both 'mayStore' and 'HasUnmodeledSideEffects' is
  // conservatively treated as a store barrier. It forces older stores to be
  // executed before newer stores are issued.
  //
  // An instruction that is both 'mayLoad' and 'HasUnmodeledSideEffects' is
  // conservatively treated as a load barrier. It forces older loads to execute
  // before newer loads are issued.
  unsigned CurrentLoadGroupID;
  unsigned CurrentLoadBarrierGroupID;
  unsigned CurrentStoreGroupID;

public:
  LSUnit(const MCSchedModel &SM)
      : LSUnit(SM, /* LQSize */ 0, /* SQSize */ 0, /* NoAlias */ false) {}
  LSUnit(const MCSchedModel &SM, unsigned LQ, unsigned SQ)
      : LSUnit(SM, LQ, SQ, /* NoAlias */ false) {}
  LSUnit(const MCSchedModel &SM, unsigned LQ, unsigned SQ, bool AssumeNoAlias)
      : LSUnitBase(SM, LQ, SQ, AssumeNoAlias), CurrentLoadGroupID(0),
        CurrentLoadBarrierGroupID(0), CurrentStoreGroupID(0) {}

  /// Returns LSU_AVAILABLE if there are enough load/store queue entries to
  /// accommodate instruction IR.
  Status isAvailable(const InstRef &IR) const override;

  /// Allocates LS resources for instruction IR.
  ///
  /// This method assumes that a previous call to `isAvailable(IR)` succeeded
  /// and returned LSU_AVAILABLE.
  ///
  /// By default, the rules are:
  /// 1. A store may not pass a previous store.
  /// 2. A load may not pass a previous store unless flag 'NoAlias' is set.
  /// 3. A load may pass a previous load.
  /// 4. A store may not pass a previous load (regardless of flag 'NoAlias').
  /// 5. A load has to wait until an older load barrier is fully executed.
  /// 6. A store has to wait until an older store barrier is fully executed.
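  ///
  /// After a successful dispatch (see the sketch in LSUnitBase::dispatch), a
  /// simulated pipeline would typically drive the remaining callbacks roughly
  /// as in the illustrative sketch below (names are hypothetical):
  ///
  /// \code
  ///   // Once all memory dependencies of IR are resolved:
  ///   if (LSU.isReady(IR))
  ///     LSU.onInstructionIssued(IR);
  ///   // When IR finishes executing:
  ///   LSU.onInstructionExecuted(IR);
  ///   // And when IR retires:
  ///   LSU.onInstructionRetired(IR);
  /// \endcode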
  unsigned dispatch(const InstRef &IR) override;

  void onInstructionExecuted(const InstRef &IR) override;
};

} // namespace mca
} // namespace llvm

#endif // LLVM_MCA_LSUNIT_H