X86ScheduleBtVer2.td revision 360784
1//=- X86ScheduleBtVer2.td - X86 BtVer2 (Jaguar) Scheduling ---*- tablegen -*-=//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file defines the machine model for AMD btver2 (Jaguar) to support
10// instruction scheduling and other instruction cost heuristics. Based off AMD Software
11// Optimization Guide for AMD Family 16h Processors & Instruction Latency appendix.
12//
13//===----------------------------------------------------------------------===//
14
// Top-level machine model record for AMD btver2 (Jaguar).
def BtVer2Model : SchedMachineModel {
  // Every x86 instruction is modeled as one micro-op, and btver2 decodes
  // two instructions per cycle.
  let IssueWidth = 2;

  // Retire Control Unit.
  let MicroOpBufferSize = 64;

  // FPU load latency; this is the worst case (integer loads have a 3 cycle
  // latency).
  let LoadLatency = 5;

  let HighLatency = 25;

  // Minimum branch misdirection penalty.
  let MispredictPenalty = 14;

  let PostRAScheduler = 1;

  // FIXME: SSE4/AVX is unimplemented. Leaving the model marked incomplete
  // lets the scheduler assign a default model to unrecognized opcodes.
  let CompleteModel = 0;
}
29
30let SchedModel = BtVer2Model in {
31
// Jaguar can issue up to 6 micro-ops in one cycle.

// Integer Pipe0: integer ALU0 (also handles FP->INT jam).
def JALU0 : ProcResource<1>;
// Integer Pipe1: integer ALU1/MUL/DIV.
def JALU1 : ProcResource<1>;
// Integer Pipe2: LAGU.
def JLAGU : ProcResource<1>;
// Integer Pipe3: SAGU (also handles 3-operand LEA).
def JSAGU : ProcResource<1>;
// Vector/FPU Pipe0: VALU0/VIMUL/FPA.
def JFPU0 : ProcResource<1>;
// Vector/FPU Pipe1: VALU1/STC/FPM.
def JFPU1 : ProcResource<1>;
39
40// The Integer PRF for Jaguar is 64 entries, and it holds the architectural and
41// speculative version of the 64-bit integer registers.
42// Reference: www.realworldtech.com/jaguar/4/
43//
44// The processor always keeps the different parts of an integer register
45// together. An instruction that writes to a part of a register will therefore
46// have a false dependence on any previous write to the same register or any
47// part of it.
48// Reference: Section 21.10 "AMD Bobcat and Jaguar pipeline: Partial register
49// access" - Agner Fog's "microarchitecture.pdf".
def JIntegerPRF : RegisterFile<
    64,
    [GR64, CCR],
    [1, 1],
    [1, 0],
    0,   // Max moves that can be eliminated per cycle.
    1>;  // Restrict move elimination to zero regs.
53
54// The Jaguar FP Retire Queue renames SIMD and FP uOps onto a pool of 72 SSE
55// registers. Operations on 256-bit data types are cracked into two COPs.
56// Reference: www.realworldtech.com/jaguar/4/
57
// The PRF in the floating point unit can eliminate a move from an MMX or SSE
// register that is known to be zero (i.e. it has been zeroed using a
// zero-idiom dependency-breaking instruction, or via VZEROALL).
// Reference: Section 21.8 "AMD Bobcat and Jaguar pipeline: Dependency-breaking
// instructions" - Agner Fog's "microarchitecture.pdf"
def JFpuPRF : RegisterFile<
    72,
    [VR64, VR128, VR256],
    [1, 1, 2],
    [1, 1, 0],
    0,   // Max moves that can be eliminated per cycle.
    1>;  // Restrict move elimination to zero regs.
66
// Retire control unit (RCU): tracks up to 64 macro-ops in flight and retires
// up to two macro-ops per cycle.
// Reference: "Software Optimization Guide for AMD Family 16h Processors"
def JRCU : RetireControlUnit<
    64,   // Macro-ops tracked in flight.
    2>;   // Macro-ops retired per cycle.
71
// Integer Pipe Scheduler: groups the two integer ALU pipes.
let BufferSize = 20 in
def JALU01 : ProcResGroup<[JALU0, JALU1]>;

// AGU Pipe Scheduler: groups the load and store AGU pipes.
let BufferSize = 12 in
def JLSAGU : ProcResGroup<[JLAGU, JSAGU]>;

// Fpu Pipe Scheduler: groups the two vector/FPU pipes.
let BufferSize = 18 in
def JFPU01 : ProcResGroup<[JFPU0, JFPU1]>;
86
// Functional units.

// Integer division.
def JDiv : ProcResource<1>;
// Integer multiplication.
def JMul : ProcResource<1>;
// Vector integer (two units).
def JVALU0 : ProcResource<1>;
def JVALU1 : ProcResource<1>;
// Vector integer multiplication.
def JVIMUL : ProcResource<1>;
// Vector store/convert.
def JSTC : ProcResource<1>;
// FP multiplication.
def JFPM : ProcResource<1>;
// FP addition.
def JFPA : ProcResource<1>;

// Functional unit groups.
def JFPX : ProcResGroup<[JFPA, JFPM]>;
def JVALU : ProcResGroup<[JVALU0, JVALU1]>;
100
// An integer load takes 3 cycles, so a ReadAfterLd register operand need not
// be available until 3 cycles after the memory operand.
def : ReadAdvance<ReadAfterLd, 3>;

// A vector load takes 5 cycles, so a ReadAfterVec*Ld register operand need not
// be available until 5 cycles after the memory operand.
def : ReadAdvance<ReadAfterVecLd, 5>;
def : ReadAdvance<ReadAfterVecXLd, 5>;
def : ReadAdvance<ReadAfterVecYLd, 5>;

/// "Additional 6 cycle transfer operation which moves a floating point
/// operation input value from the integer unit to the floating point unit."
/// Reference: AMDfam16h SOG (Appendix A "Instruction Latencies", Section A.2).
/// The advance is negative, i.e. it delays the read by 6 cycles.
def : ReadAdvance<ReadInt2Fpu, -6>;
115
// Many SchedWrites are defined in pairs with and without a folded load.
// Instructions with folded loads are usually micro-fused, so they only appear
// as two micro-ops when dispatched by the schedulers.
//
// This multiclass emits both members of such a pair for an integer write:
//   SchedRW  - the foldable write; SchedRW.Folded is its load-folded form.
//   ExePorts - resources consumed by the register form.
//   Lat      - latency of the register form.
//   Res      - cycles for each entry of ExePorts; when left empty, no
//              explicit cycle list is set on either form.
//   UOps     - micro-op count of the register form.
//   LoadUOps - micro-ops added on top of UOps for the load-folded form.
multiclass JWriteResIntPair<X86FoldableSchedWrite SchedRW,
                            list<ProcResourceKind> ExePorts,
                            int Lat, list<int> Res = [], int UOps = 1,
                            int LoadUOps = 0> {
  // Register form: uses ExePorts only.
  def : WriteRes<SchedRW, ExePorts> {
    let NumMicroOps = UOps;
    let ResourceCycles = Res;
    let Latency = Lat;
  }

  // Load-folded form: JLAGU is prepended to the port list (one cycle when an
  // explicit cycle list is given) and the latency grows by the 3-cycle
  // integer load.
  def : WriteRes<SchedRW.Folded, !listconcat([JLAGU], ExePorts)> {
    let NumMicroOps = !add(UOps, LoadUOps);
    let ResourceCycles = !if(!empty(Res), [], !listconcat([1], Res));
    let Latency = !add(Lat, 3);
  }
}
140
// Emits the register form and the load-folded form of an FPU/vector write.
// Parameters have the same meaning as in the other pair multiclasses:
// SchedRW (foldable write), ExePorts (resources), Lat (latency), Res (cycles
// per resource, may be empty), UOps (micro-ops) and LoadUOps (micro-ops added
// for the load-folded form).
multiclass JWriteResFpuPair<X86FoldableSchedWrite SchedRW,
                            list<ProcResourceKind> ExePorts,
                            int Lat, list<int> Res = [], int UOps = 1,
                            int LoadUOps = 0> {
  // Register form: uses ExePorts only.
  def : WriteRes<SchedRW, ExePorts> {
    let NumMicroOps = UOps;
    let ResourceCycles = Res;
    let Latency = Lat;
  }

  // Load-folded form: JLAGU is prepended to the port list (one cycle when an
  // explicit cycle list is given) and the latency grows by the 5-cycle
  // vector load.
  def : WriteRes<SchedRW.Folded, !listconcat([JLAGU], ExePorts)> {
    let NumMicroOps = !add(UOps, LoadUOps);
    let ResourceCycles = !if(!empty(Res), [], !listconcat([1], Res));
    let Latency = !add(Lat, 5);
  }
}
160
// Emits the register form and the load-folded form of a 256-bit (YMM) write.
// Unlike the other pair multiclasses, Res defaults to [2] and UOps to 2, and
// the folded form always sets an explicit cycle list.
multiclass JWriteResYMMPair<X86FoldableSchedWrite SchedRW,
                            list<ProcResourceKind> ExePorts,
                            int Lat, list<int> Res = [2], int UOps = 2,
                            int LoadUOps = 0> {
  // Register form: uses ExePorts only, for the cycles listed in Res.
  def : WriteRes<SchedRW, ExePorts> {
    let NumMicroOps = UOps;
    let ResourceCycles = Res;
    let Latency = Lat;
  }

  // Load-folded form: JLAGU is prepended to the port list for 2 cycles and
  // the latency grows by the 5-cycle vector load.
  def : WriteRes<SchedRW.Folded, !listconcat([JLAGU], ExePorts)> {
    let NumMicroOps = !add(UOps, LoadUOps);
    let ResourceCycles = !listconcat([2], Res);
    let Latency = !add(Lat, 5);
  }
}
180
181// Instructions that have local forwarding disabled have an extra +1cy latency.
182
// A folded store needs one cycle on the SAGU for the store data, and most RMW
// instructions do not need an extra uop. ALU RMW operations don't seem to
// benefit from STLF: their observed latency is 6cy. For that reason this write
// adds two extra cycles of latency (rather than just 1cy for the store).
defm : X86WriteRes<WriteRMW, [JSAGU], 2, [1], 0>;
188
189////////////////////////////////////////////////////////////////////////////////
190// Arithmetic.
191////////////////////////////////////////////////////////////////////////////////
192
// Simple ALU operations and add-with-carry.
defm : JWriteResIntPair<WriteALU, [JALU01], 1>;
defm : JWriteResIntPair<WriteADC, [JALU01], 1, [2]>;

// Byte swaps.
defm : X86WriteRes<WriteBSWAP32, [JALU01], 1, [1], 1>;
defm : X86WriteRes<WriteBSWAP64, [JALU01], 1, [1], 1>;

// Compare-and-exchange and exchange.
defm : X86WriteRes<WriteCMPXCHG, [JALU01], 3, [3], 5>;
defm : X86WriteRes<WriteCMPXCHGRMW, [JALU01, JSAGU, JLAGU], 11, [3, 1, 1], 6>;
defm : X86WriteRes<WriteXCHG, [JALU01], 1, [2], 2>;
201
// Integer multiplication. Every form except WriteIMulH uses JALU1 plus the
// JMul unit.
// 8-bit.
defm : JWriteResIntPair<WriteIMul8, [JALU1, JMul], 3, [1, 1], 1>;
// 16-bit.
defm : JWriteResIntPair<WriteIMul16, [JALU1, JMul], 3, [1, 3], 3>;
defm : JWriteResIntPair<WriteIMul16Imm, [JALU1, JMul], 4, [1, 2], 2>;
defm : JWriteResIntPair<WriteIMul16Reg, [JALU1, JMul], 3, [1, 1], 1>;
// 32-bit.
defm : JWriteResIntPair<WriteIMul32, [JALU1, JMul], 3, [1, 2], 2>;
defm : JWriteResIntPair<WriteIMul32Imm, [JALU1, JMul], 3, [1, 1], 1>;
defm : JWriteResIntPair<WriteIMul32Reg, [JALU1, JMul], 3, [1, 1], 1>;
// 64-bit.
defm : JWriteResIntPair<WriteIMul64, [JALU1, JMul], 6, [1, 4], 2>;
defm : JWriteResIntPair<WriteIMul64Imm, [JALU1, JMul], 6, [1, 4], 1>;
defm : JWriteResIntPair<WriteIMul64Reg, [JALU1, JMul], 6, [1, 4], 1>;
// WriteIMulH is modeled on JALU1 only.
defm : X86WriteRes<WriteIMulH, [JALU1], 6, [4], 1>;
213
// Integer division: JALU1 for one cycle, and the JDiv unit for as many cycles
// as the latency.
// WriteDiv* forms.
defm : JWriteResIntPair<WriteDiv8, [JALU1, JDiv], 12, [1, 12], 1>;
defm : JWriteResIntPair<WriteDiv16, [JALU1, JDiv], 17, [1, 17], 2>;
defm : JWriteResIntPair<WriteDiv32, [JALU1, JDiv], 25, [1, 25], 2>;
defm : JWriteResIntPair<WriteDiv64, [JALU1, JDiv], 41, [1, 41], 2>;
// WriteIDiv* forms (same numbers as the WriteDiv* forms).
defm : JWriteResIntPair<WriteIDiv8, [JALU1, JDiv], 12, [1, 12], 1>;
defm : JWriteResIntPair<WriteIDiv16, [JALU1, JDiv], 17, [1, 17], 2>;
defm : JWriteResIntPair<WriteIDiv32, [JALU1, JDiv], 25, [1, 25], 2>;
defm : JWriteResIntPair<WriteIDiv64, [JALU1, JDiv], 41, [1, 41], 2>;
222
defm : JWriteResIntPair<WriteCRC32, [JALU01], 3, [4], 3>;

// Conditional move.
defm : JWriteResIntPair<WriteCMOV, [JALU01], 1>;
// x87 conditional move.
defm : X86WriteRes<WriteFCMOV, [JFPU0, JFPA], 3, [1, 1], 1>;
// Setcc, register and store forms.
def : WriteRes<WriteSETCC, [JALU01]>;
def : WriteRes<WriteSETCCStore, [JALU01, JSAGU]>;
def : WriteRes<WriteLAHFSAHF, [JALU01]>;
230
// Bit test: register form, then the two load forms (which add JLAGU).
defm : X86WriteRes<WriteBitTest, [JALU01], 1, [1], 1>;
defm : X86WriteRes<WriteBitTestImmLd, [JALU01, JLAGU], 4, [1, 1], 1>;
defm : X86WriteRes<WriteBitTestRegLd, [JALU01, JLAGU], 4, [1, 1], 5>;
// Bit test-and-set: register form, then the two load forms.
defm : X86WriteRes<WriteBitTestSet, [JALU01], 1, [1], 2>;
defm : X86WriteRes<WriteBitTestSetImmLd, [JALU01, JLAGU], 4, [1, 1], 4>;
defm : X86WriteRes<WriteBitTestSetRegLd, [JALU01, JLAGU], 4, [1, 1], 8>;
237
// Simple LEAs, i.e. those with one or two input operands.
def : WriteRes<WriteLEA, [JALU01]>;

// Bit scans and bit counts.
defm : JWriteResIntPair<WriteBSF, [JALU01], 4, [8], 7>;
defm : JWriteResIntPair<WriteBSR, [JALU01], 5, [8], 8>;
defm : JWriteResIntPair<WritePOPCNT, [JALU01], 1>;
defm : JWriteResIntPair<WriteLZCNT, [JALU01], 1>;
defm : JWriteResIntPair<WriteTZCNT, [JALU01], 2, [2], 2>;

// BMI1 BEXTR/BLS. BMI2 BZHI is marked unsupported in this model.
defm : JWriteResIntPair<WriteBEXTR, [JALU01], 1>;
defm : JWriteResIntPair<WriteBLS, [JALU01], 2, [2], 2>;
defm : X86WriteResPairUnsupported<WriteBZHI>;
252
253////////////////////////////////////////////////////////////////////////////////
254// Integer shifts and rotates.
255////////////////////////////////////////////////////////////////////////////////
256
// Shifts and rotates, with an immediate or CL count: one cycle on JALU01.
defm : JWriteResIntPair<WriteShift, [JALU01], 1>;
defm : JWriteResIntPair<WriteShiftCL, [JALU01], 1>;
defm : JWriteResIntPair<WriteRotate, [JALU01], 1>;
defm : JWriteResIntPair<WriteRotateCL, [JALU01], 1>;

// SHLD/SHRD: register forms first, then the memory forms (which add JLAGU).
defm : X86WriteRes<WriteSHDrri, [JALU01], 3, [6], 6>;
defm : X86WriteRes<WriteSHDrrcl, [JALU01], 4, [8], 7>;
defm : X86WriteRes<WriteSHDmri, [JLAGU, JALU01], 9, [1, 22], 8>;
defm : X86WriteRes<WriteSHDmrcl, [JLAGU, JALU01], 9, [1, 22], 8>;
267
268////////////////////////////////////////////////////////////////////////////////
269// Loads, stores, and moves, not folded with other operations.
270////////////////////////////////////////////////////////////////////////////////
271
// Plain load: JLAGU, 3 cycle latency.
def : WriteRes<WriteLoad, [JLAGU]> {
  let Latency = 3;
}
// Plain and non-temporal stores go to JSAGU.
def : WriteRes<WriteStore, [JSAGU]>;
def : WriteRes<WriteStoreNT, [JSAGU]>;
// Register move.
def : WriteRes<WriteMove, [JALU01]>;

// Load/store MXCSR.
def : WriteRes<WriteLDMXCSR, [JLAGU]> {
  let Latency = 3;
}
def : WriteRes<WriteSTMXCSR, [JSAGU]>;

// Treat misc copies as a move.
def : InstRW<[WriteMove], (instrs COPY)>;
283
284////////////////////////////////////////////////////////////////////////////////
285// Idioms that clear a register, like xorps %xmm0, %xmm0.
286// These can often bypass execution ports completely.
287////////////////////////////////////////////////////////////////////////////////
288
// Zero idioms use no resources.
def : WriteRes<WriteZero, []>;

////////////////////////////////////////////////////////////////////////////////
// Branches don't produce values, so they have no latency, but they still
// consume resources. Indirect branches can fold loads.
////////////////////////////////////////////////////////////////////////////////

defm : JWriteResIntPair<WriteJump, [JALU01], 1>;

////////////////////////////////////////////////////////////////////////////////
// Special case scheduling classes.
////////////////////////////////////////////////////////////////////////////////

def : WriteRes<WriteSystem, [JALU01]> {
  let Latency = 100;
}
def : WriteRes<WriteMicrocoded, [JALU01]> {
  let Latency = 100;
}
def : WriteRes<WriteFence, [JSAGU]>;

// A nop has no dependencies, so it has no real latency; the value '1' tells
// the scheduler that the nop occupies an ALU slot for one cycle.
def : WriteRes<WriteNop, [JALU01]> {
  let Latency = 1;
}
309
// Writes used by JWriteCMPXCHGVariant. The LOCK_ versions keep JLAGU and
// JSAGU busy for as many cycles as their latency.

// Register-register 8-bit CMPXCHG.
def JWriteCMPXCHG8rr : SchedWriteRes<[JALU01]> {
  let NumMicroOps = 3;
  let ResourceCycles = [3];
  let Latency = 3;
}

// Atomic 8-bit register-memory CMPXCHG.
def JWriteLOCK_CMPXCHG8rm : SchedWriteRes<[JALU01, JLAGU, JSAGU]> {
  let NumMicroOps = 5;
  let ResourceCycles = [3, 16, 16];
  let Latency = 16;
}

// Atomic register-memory CMPXCHG (remaining operand sizes).
def JWriteLOCK_CMPXCHGrm : SchedWriteRes<[JALU01, JLAGU, JSAGU]> {
  let NumMicroOps = 6;
  let ResourceCycles = [3, 17, 17];
  let Latency = 17;
}

// Non-atomic 8-bit register-memory CMPXCHG.
def JWriteCMPXCHG8rm : SchedWriteRes<[JALU01, JLAGU, JSAGU]> {
  let NumMicroOps = 5;
  let ResourceCycles = [3, 1, 1];
  let Latency = 11;
}

// Non-atomic CMPXCHG8B.
def JWriteCMPXCHG8B : SchedWriteRes<[JALU01, JLAGU, JSAGU]> {
  let NumMicroOps = 18;
  let ResourceCycles = [3, 1, 1];
  let Latency = 11;
}

// Non-atomic CMPXCHG16B.
def JWriteCMPXCHG16B : SchedWriteRes<[JALU01, JLAGU, JSAGU]> {
  let NumMicroOps = 28;
  let ResourceCycles = [6, 1, 1];
  let Latency = 32;
}

// Atomic CMPXCHG8B.
def JWriteLOCK_CMPXCHG8B : SchedWriteRes<[JALU01, JLAGU, JSAGU]> {
  let NumMicroOps = 18;
  let ResourceCycles = [3, 19, 19];
  let Latency = 19;
}

// Atomic CMPXCHG16B.
def JWriteLOCK_CMPXCHG16B : SchedWriteRes<[JALU01, JLAGU, JSAGU]> {
  let NumMicroOps = 28;
  let ResourceCycles = [6, 38, 38];
  let Latency = 38;
}
357
// Picks the CMPXCHG write from the listed predicates. The entries keep their
// original order; the NoSchedPred entry at the end is the fallback.
def JWriteCMPXCHGVariant : SchedWriteVariant<[
  // Atomic forms.
  SchedVar<MCSchedPredicate<IsAtomicCompareAndSwap8B>, [JWriteLOCK_CMPXCHG8B]>,
  SchedVar<MCSchedPredicate<IsAtomicCompareAndSwap16B>, [JWriteLOCK_CMPXCHG16B]>,
  SchedVar<MCSchedPredicate<IsAtomicCompareAndSwap_8>, [JWriteLOCK_CMPXCHG8rm]>,
  SchedVar<MCSchedPredicate<IsAtomicCompareAndSwap>, [JWriteLOCK_CMPXCHGrm]>,
  // Non-atomic forms.
  SchedVar<MCSchedPredicate<IsCompareAndSwap8B>, [JWriteCMPXCHG8B]>,
  SchedVar<MCSchedPredicate<IsCompareAndSwap16B>, [JWriteCMPXCHG16B]>,
  SchedVar<MCSchedPredicate<IsRegMemCompareAndSwap_8>, [JWriteCMPXCHG8rm]>,
  SchedVar<MCSchedPredicate<IsRegMemCompareAndSwap>, [WriteCMPXCHGRMW]>,
  SchedVar<MCSchedPredicate<IsRegRegCompareAndSwap_8>, [JWriteCMPXCHG8rr]>,
  // Fallback.
  SchedVar<NoSchedPred, [WriteCMPXCHG]>
]>;
370
// Register-memory CMPXCHG, atomic and non-atomic.
// The first five reads come from the memory load operand and are left at
// ReadDefault. The remaining input operands, including the implicit read of
// RAX, get a read-advance.
def : InstRW<[JWriteCMPXCHGVariant,
              ReadDefault, ReadDefault, ReadDefault, ReadDefault, ReadDefault,
              ReadAfterLd, ReadAfterLd],
             (instrs LCMPXCHG8, LCMPXCHG16, LCMPXCHG32, LCMPXCHG64,
                     CMPXCHG8rm, CMPXCHG16rm, CMPXCHG32rm, CMPXCHG64rm)>;

// Register-register CMPXCHG.
def : InstRW<[JWriteCMPXCHGVariant],
             (instrs CMPXCHG8rr, CMPXCHG16rr, CMPXCHG32rr, CMPXCHG64rr)>;

// CMPXCHG8B/CMPXCHG16B, atomic and non-atomic.
def : InstRW<[JWriteCMPXCHGVariant,
              // Ignore reads contributed by the memory operand.
              ReadDefault, ReadDefault, ReadDefault, ReadDefault, ReadDefault,
              // Add a read-advance to every implicit register read.
              ReadAfterLd, ReadAfterLd, ReadAfterLd, ReadAfterLd],
             (instrs LCMPXCHG8B, LCMPXCHG16B, CMPXCHG8B, CMPXCHG16B)>;
390
// Write used for ALU read-modify-write instructions that carry a LOCK prefix.
def JWriteLOCK_ALURMW : SchedWriteRes<[JALU01, JLAGU, JSAGU]> {
  let NumMicroOps = 1;
  let ResourceCycles = [1, 19, 19];
  let Latency = 19;
}

// Uses JWriteLOCK_ALURMW when the LOCK prefix is present, WriteALURMW
// otherwise.
def JWriteLOCK_ALURMWVariant : SchedWriteVariant<[
  SchedVar<MCSchedPredicate<CheckLockPrefix>, [JWriteLOCK_ALURMW]>,
  SchedVar<NoSchedPred, [WriteALURMW]>
]>;

def : InstRW<[JWriteLOCK_ALURMWVariant],
             (instrs INC8m, INC16m, INC32m, INC64m,
                     DEC8m, DEC16m, DEC32m, DEC64m,
                     NOT8m, NOT16m, NOT32m, NOT64m,
                     NEG8m, NEG16m, NEG32m, NEG64m)>;
405
// Shared write for XCHG8rr and the register-register XADD forms.
def JWriteXCHG8rr_XADDrr : SchedWriteRes<[JALU01]> {
  let NumMicroOps = 3;
  let ResourceCycles = [3];
  let Latency = 2;
}

def : InstRW<[JWriteXCHG8rr_XADDrr],
             (instrs XCHG8rr, XADD8rr, XADD16rr, XADD32rr, XADD64rr)>;
413
// Latency of the in/out register operand of a non-atomic XADDrm.
//
// Non-atomic XADDrm instructions are modeled with a pair of writes: this one,
// followed by JWriteXADDrm_LdSt_Part. Two writes are needed because the
// instruction latency differs from the latency of the output register
// operand. This write describes the first (and only) output register operand,
// while the instruction latency is the MAX of all the write latencies, which
// is why the second write exists (see the example below).
//
// Example:
//     XADD %ecx, (%rsp)      ## Instruction latency: 11cy
//                            ## ECX write Latency: 3cy
//
// Register ECX becomes available after 3 cycles: its value is exchanged with
// the value loaded from the memory location addressed by the stack pointer,
// and the load-to-use latency is assumed to be 3cy.
def JWriteXADDrm_XCHG_Part : SchedWriteRes<[JALU01]> {
  let NumMicroOps = 3;
  let ResourceCycles = [3];
  let Latency = 3;  // load-to-use latency
}
436
// Latency of the in/out register operand of an atomic XADDrm.
//
// Atomic XADD instructions are modeled with a sequence of two writes: this
// one, followed by JWriteXCHGrm_LdSt_Part.
//
// Example:
//    LOCK XADD %ecx, (%rsp)     ## Instruction Latency: 16cy
//                               ## ECX write Latency: 11cy
//
// The value of ECX only becomes available 11cy after the start of execution;
// this write exists specifically to set that operand latency.
def JWriteLOCK_XADDrm_XCHG_Part : SchedWriteRes<[JALU01]> {
  let NumMicroOps = 3;
  let ResourceCycles = [3];
  let Latency = 11;
}
453
// Latency of the in/out register operand of an atomic XCHGrm.
//
// Atomic XCHG operations are described by a sequence of two writes, of which
// this is the first. Two writes are needed because the instruction latency
// differs from the output register write latency: the output register operand
// must become visible after 11cy, while the instruction latency is to be 16cy.
def JWriteXCHGrm_XCHG_Part : SchedWriteRes<[JALU01]> {
  let NumMicroOps = 2;
  let ResourceCycles = [2];
  let Latency = 11;
}
465
// Second write of the non-atomic XADDrm pair: load/store part, 11cy.
def JWriteXADDrm_LdSt_Part : SchedWriteRes<[JLAGU, JSAGU]> {
  let NumMicroOps = 1;
  let ResourceCycles = [1, 1];
  let Latency = 11;
}

// Second write of the XCHGrm and atomic XADDrm sequences: load/store part,
// 16cy, with both AGUs held for 16 cycles.
def JWriteXCHGrm_LdSt_Part : SchedWriteRes<[JLAGU, JSAGU]> {
  let NumMicroOps = 1;
  let ResourceCycles = [16, 16];
  let Latency = 16;
}
477
// First write of an XADDrm: the register operand latency, chosen by whether
// the LOCK prefix is present.
def JWriteXADDrm_Part1 : SchedWriteVariant<[
  SchedVar<MCSchedPredicate<CheckLockPrefix>, [JWriteLOCK_XADDrm_XCHG_Part]>,
  SchedVar<NoSchedPred, [JWriteXADDrm_XCHG_Part]>
]>;

// Second write of an XADDrm: the load/store part, chosen the same way.
def JWriteXADDrm_Part2 : SchedWriteVariant<[
  SchedVar<MCSchedPredicate<CheckLockPrefix>, [JWriteXCHGrm_LdSt_Part]>,
  SchedVar<NoSchedPred, [JWriteXADDrm_LdSt_Part]>
]>;

// Memory XADD, with and without LOCK.
def : InstRW<[JWriteXADDrm_Part1, JWriteXADDrm_Part2, ReadAfterLd],
             (instrs XADD8rm, XADD16rm, XADD32rm, XADD64rm,
                     LXADD8, LXADD16, LXADD32, LXADD64)>;

// Memory XCHG.
def : InstRW<[JWriteXCHGrm_XCHG_Part, JWriteXCHGrm_LdSt_Part, ReadAfterLd],
             (instrs XCHG8rm, XCHG16rm, XCHG32rm, XCHG64rm)>;
494
495
496////////////////////////////////////////////////////////////////////////////////
497// Floating point. This covers both scalar and vector operations.
498////////////////////////////////////////////////////////////////////////////////
499
// x87 constant loads.
defm : X86WriteRes<WriteFLD0, [JFPU1, JSTC], 3, [1, 1], 1>;
defm : X86WriteRes<WriteFLD1, [JFPU1, JSTC], 3, [1, 1], 1>;
defm : X86WriteRes<WriteFLDC, [JFPU1, JSTC], 3, [1, 1], 1>;
// FP loads.
defm : X86WriteRes<WriteFLoad, [JLAGU, JFPU01, JFPX], 5, [1, 1, 1], 1>;
defm : X86WriteRes<WriteFLoadX, [JLAGU], 5, [1], 1>;
defm : X86WriteRes<WriteFLoadY, [JLAGU], 5, [2], 2>;
// Masked FP loads.
defm : X86WriteRes<WriteFMaskedLoad, [JLAGU, JFPU01, JFPX], 6, [1, 2, 2], 1>;
defm : X86WriteRes<WriteFMaskedLoadY, [JLAGU, JFPU01, JFPX], 6, [2, 4, 4], 2>;
508
// FP stores: all use JSAGU, JFPU1 and JSTC.
defm : X86WriteRes<WriteFStore, [JSAGU, JFPU1, JSTC], 2, [1, 1, 1], 1>;
defm : X86WriteRes<WriteFStoreX, [JSAGU, JFPU1, JSTC], 1, [1, 1, 1], 1>;
defm : X86WriteRes<WriteFStoreY, [JSAGU, JFPU1, JSTC], 1, [2, 2, 2], 2>;
// Non-temporal FP stores.
defm : X86WriteRes<WriteFStoreNT, [JSAGU, JFPU1, JSTC], 3, [1, 1, 1], 1>;
defm : X86WriteRes<WriteFStoreNTX, [JSAGU, JFPU1, JSTC], 3, [1, 1, 1], 1>;
defm : X86WriteRes<WriteFStoreNTY, [JSAGU, JFPU1, JSTC], 3, [2, 2, 2], 1>;
515
// Masked FP stores. All four share one resource list; only the latency, the
// per-resource cycles and the micro-op count differ.
defm : X86WriteRes<WriteFMaskedStore32,
                   [JFPU0, JFPA, JFPU1, JSTC, JLAGU, JSAGU, JALU01],
                   16, [1, 1, 5, 5, 4, 4, 4], 19>;
defm : X86WriteRes<WriteFMaskedStore64,
                   [JFPU0, JFPA, JFPU1, JSTC, JLAGU, JSAGU, JALU01],
                   13, [1, 1, 2, 2, 2, 2, 2], 10>;
defm : X86WriteRes<WriteFMaskedStore32Y,
                   [JFPU0, JFPA, JFPU1, JSTC, JLAGU, JSAGU, JALU01],
                   22, [1, 1, 10, 10, 8, 8, 8], 36>;
defm : X86WriteRes<WriteFMaskedStore64Y,
                   [JFPU0, JFPA, JFPU1, JSTC, JLAGU, JSAGU, JALU01],
                   16, [1, 1, 4, 4, 4, 4, 4], 18>;
520
// FP register moves.
defm : X86WriteRes<WriteFMove, [JFPU01, JFPX], 1, [1, 1], 1>;
defm : X86WriteRes<WriteFMoveX, [JFPU01, JFPX], 1, [1, 1], 1>;
defm : X86WriteRes<WriteFMoveY, [JFPU01, JFPX], 1, [2, 2], 2>;

// EMMS.
defm : X86WriteRes<WriteEMMS, [JFPU01, JFPX], 2, [1, 1], 1>;
526
// FP arithmetic. In each group the 256-bit (Y) write goes through
// JWriteResYMMPair and the Z write is marked unsupported.

// Addition (JFPU0 + JFPA).
defm : JWriteResFpuPair<WriteFAdd, [JFPU0, JFPA], 3>;
defm : JWriteResFpuPair<WriteFAddX, [JFPU0, JFPA], 3>;
defm : JWriteResYMMPair<WriteFAddY, [JFPU0, JFPA], 3, [2, 2], 2>;
defm : X86WriteResPairUnsupported<WriteFAddZ>;
defm : JWriteResFpuPair<WriteFAdd64, [JFPU0, JFPA], 3>;
defm : JWriteResFpuPair<WriteFAdd64X, [JFPU0, JFPA], 3>;
defm : JWriteResYMMPair<WriteFAdd64Y, [JFPU0, JFPA], 3, [2, 2], 2>;
defm : X86WriteResPairUnsupported<WriteFAdd64Z>;

// Compare (JFPU0 + JFPA); WriteFCom additionally uses JALU0.
defm : JWriteResFpuPair<WriteFCmp, [JFPU0, JFPA], 2>;
defm : JWriteResFpuPair<WriteFCmpX, [JFPU0, JFPA], 2>;
defm : JWriteResYMMPair<WriteFCmpY, [JFPU0, JFPA], 2, [2, 2], 2>;
defm : X86WriteResPairUnsupported<WriteFCmpZ>;
defm : JWriteResFpuPair<WriteFCmp64, [JFPU0, JFPA], 2>;
defm : JWriteResFpuPair<WriteFCmp64X, [JFPU0, JFPA], 2>;
defm : JWriteResYMMPair<WriteFCmp64Y, [JFPU0, JFPA], 2, [2, 2], 2>;
defm : X86WriteResPairUnsupported<WriteFCmp64Z>;
defm : JWriteResFpuPair<WriteFCom, [JFPU0, JFPA, JALU0], 3>;

// Multiplication (JFPU1 + JFPM).
defm : JWriteResFpuPair<WriteFMul, [JFPU1, JFPM], 2>;
defm : JWriteResFpuPair<WriteFMulX, [JFPU1, JFPM], 2>;
defm : JWriteResYMMPair<WriteFMulY, [JFPU1, JFPM], 2, [2, 2], 2>;
defm : X86WriteResPairUnsupported<WriteFMulZ>;
defm : JWriteResFpuPair<WriteFMul64, [JFPU1, JFPM], 4, [1, 2]>;
defm : JWriteResFpuPair<WriteFMul64X, [JFPU1, JFPM], 4, [1, 2]>;
defm : JWriteResYMMPair<WriteFMul64Y, [JFPU1, JFPM], 4, [2, 4], 2>;
defm : X86WriteResPairUnsupported<WriteFMul64Z>;

// Fused multiply-add: every form is marked unsupported.
defm : X86WriteResPairUnsupported<WriteFMA>;
defm : X86WriteResPairUnsupported<WriteFMAX>;
defm : X86WriteResPairUnsupported<WriteFMAY>;
defm : X86WriteResPairUnsupported<WriteFMAZ>;

// Dot products (JFPU1 + JFPM + JFPA).
defm : JWriteResFpuPair<WriteDPPD, [JFPU1, JFPM, JFPA], 9, [1, 3, 3], 3>;
defm : JWriteResFpuPair<WriteDPPS, [JFPU1, JFPM, JFPA], 11, [1, 3, 3], 5>;
defm : JWriteResYMMPair<WriteDPPSY, [JFPU1, JFPM, JFPA], 12, [2, 6, 6], 10>;
defm : X86WriteResPairUnsupported<WriteDPPSZ>;

// Reciprocal and reciprocal square root (JFPU1 + JFPM).
defm : JWriteResFpuPair<WriteFRcp, [JFPU1, JFPM], 2>;
defm : JWriteResFpuPair<WriteFRcpX, [JFPU1, JFPM], 2>;
defm : JWriteResYMMPair<WriteFRcpY, [JFPU1, JFPM], 2, [2, 2], 2>;
defm : X86WriteResPairUnsupported<WriteFRcpZ>;
defm : JWriteResFpuPair<WriteFRsqrt, [JFPU1, JFPM], 2>;
defm : JWriteResFpuPair<WriteFRsqrtX, [JFPU1, JFPM], 2>;
defm : JWriteResYMMPair<WriteFRsqrtY, [JFPU1, JFPM], 2, [2, 2], 2>;
defm : X86WriteResPairUnsupported<WriteFRsqrtZ>;

// Division (JFPU1 + JFPM); JFPM is held for as many cycles as the latency.
defm : JWriteResFpuPair<WriteFDiv, [JFPU1, JFPM], 19, [1, 19]>;
defm : JWriteResFpuPair<WriteFDivX, [JFPU1, JFPM], 19, [1, 19]>;
defm : JWriteResYMMPair<WriteFDivY, [JFPU1, JFPM], 38, [2, 38], 2>;
defm : X86WriteResPairUnsupported<WriteFDivZ>;
defm : JWriteResFpuPair<WriteFDiv64, [JFPU1, JFPM], 19, [1, 19]>;
defm : JWriteResFpuPair<WriteFDiv64X, [JFPU1, JFPM], 19, [1, 19]>;
defm : JWriteResYMMPair<WriteFDiv64Y, [JFPU1, JFPM], 38, [2, 38], 2>;
defm : X86WriteResPairUnsupported<WriteFDiv64Z>;

// Square root (JFPU1 + JFPM); JFPM is held for as many cycles as the latency.
defm : JWriteResFpuPair<WriteFSqrt, [JFPU1, JFPM], 21, [1, 21]>;
defm : JWriteResFpuPair<WriteFSqrtX, [JFPU1, JFPM], 21, [1, 21]>;
defm : JWriteResYMMPair<WriteFSqrtY, [JFPU1, JFPM], 42, [2, 42], 2>;
defm : X86WriteResPairUnsupported<WriteFSqrtZ>;
defm : JWriteResFpuPair<WriteFSqrt64, [JFPU1, JFPM], 27, [1, 27]>;
defm : JWriteResFpuPair<WriteFSqrt64X, [JFPU1, JFPM], 27, [1, 27]>;
defm : JWriteResYMMPair<WriteFSqrt64Y, [JFPU1, JFPM], 54, [2, 54], 2>;
defm : X86WriteResPairUnsupported<WriteFSqrt64Z>;
defm : JWriteResFpuPair<WriteFSqrt80, [JFPU1, JFPM], 35, [1, 35]>;

// Sign manipulation and rounding.
defm : JWriteResFpuPair<WriteFSign, [JFPU1, JFPM], 2>;
defm : JWriteResFpuPair<WriteFRnd, [JFPU1, JSTC], 3>;
defm : JWriteResYMMPair<WriteFRndY, [JFPU1, JSTC], 3, [2, 2], 2>;
defm : X86WriteResPairUnsupported<WriteFRndZ>;

// Logic and test.
defm : JWriteResFpuPair<WriteFLogic, [JFPU01, JFPX], 1>;
defm : JWriteResYMMPair<WriteFLogicY, [JFPU01, JFPX], 1, [2, 2], 2>;
defm : X86WriteResPairUnsupported<WriteFLogicZ>;
defm : JWriteResFpuPair<WriteFTest, [JFPU0, JFPA, JALU0], 3>;
defm : JWriteResYMMPair<WriteFTestY, [JFPU01, JFPX, JFPA, JALU0], 4, [2, 2, 2, 1], 3>;
defm : X86WriteResPairUnsupported<WriteFTestZ>;

// Shuffles. The variable shuffles carry +1cy latency.
defm : JWriteResFpuPair<WriteFShuffle, [JFPU01, JFPX], 1>;
defm : JWriteResYMMPair<WriteFShuffleY, [JFPU01, JFPX], 1, [2, 2], 2>;
defm : X86WriteResPairUnsupported<WriteFShuffleZ>;
defm : JWriteResFpuPair<WriteFVarShuffle, [JFPU01, JFPX], 3, [1, 4], 3>; // +1cy latency.
defm : JWriteResYMMPair<WriteFVarShuffleY, [JFPU01, JFPX], 4, [2, 6], 6>; // +1cy latency.
defm : X86WriteResPairUnsupported<WriteFVarShuffleZ>;

// Blends.
defm : JWriteResFpuPair<WriteFBlend, [JFPU01, JFPX], 1>;
defm : JWriteResYMMPair<WriteFBlendY, [JFPU01, JFPX], 1, [2, 2], 2>;
defm : X86WriteResPairUnsupported<WriteFBlendZ>;
defm : JWriteResFpuPair<WriteFVarBlend, [JFPU01, JFPX], 2, [4, 4], 3>;
defm : JWriteResYMMPair<WriteFVarBlendY, [JFPU01, JFPX], 3, [6, 6], 6>;
defm : X86WriteResPairUnsupported<WriteFVarBlendZ>;

// 256-bit shuffles; the variable form is marked unsupported.
defm : JWriteResFpuPair<WriteFShuffle256, [JFPU01, JFPX], 1, [2, 2], 2>;
defm : X86WriteResPairUnsupported<WriteFVarShuffle256>;
609
610////////////////////////////////////////////////////////////////////////////////
611// Conversions.
612////////////////////////////////////////////////////////////////////////////////
613
// FP -> integer.
defm : JWriteResFpuPair<WriteCvtSS2I, [JFPU1, JSTC, JFPU0, JFPA, JALU0], 7, [1, 1, 1, 1, 1], 2>;
defm : JWriteResFpuPair<WriteCvtPS2I, [JFPU1, JSTC], 3, [1, 1], 1>;
defm : JWriteResYMMPair<WriteCvtPS2IY, [JFPU1, JSTC], 3, [2, 2], 2>;
defm : X86WriteResPairUnsupported<WriteCvtPS2IZ>;
defm : JWriteResFpuPair<WriteCvtSD2I, [JFPU1, JSTC, JFPU0, JFPA, JALU0], 7, [1, 1, 1, 1, 1], 2>;
defm : JWriteResFpuPair<WriteCvtPD2I, [JFPU1, JSTC], 3, [1, 1], 1>;
defm : JWriteResYMMPair<WriteCvtPD2IY, [JFPU1, JSTC, JFPX], 6, [2, 2, 4], 3>;
defm : X86WriteResPairUnsupported<WriteCvtPD2IZ>;

// Integer -> FP. The scalar forms define their load variants explicitly
// instead of going through a pair multiclass.
defm : X86WriteRes<WriteCvtI2SS, [JFPU1, JSTC], 4, [1, 1], 2>;
defm : X86WriteRes<WriteCvtI2SSLd, [JLAGU, JFPU1, JSTC], 9, [1, 1, 1], 1>;
defm : JWriteResFpuPair<WriteCvtI2PS, [JFPU1, JSTC], 3, [1, 1], 1>;
defm : JWriteResYMMPair<WriteCvtI2PSY, [JFPU1, JSTC], 3, [2, 2], 2>;
defm : X86WriteResPairUnsupported<WriteCvtI2PSZ>;
defm : X86WriteRes<WriteCvtI2SD, [JFPU1, JSTC], 4, [1, 1], 2>;
defm : X86WriteRes<WriteCvtI2SDLd, [JLAGU, JFPU1, JSTC], 9, [1, 1, 1], 1>;
defm : JWriteResFpuPair<WriteCvtI2PD, [JFPU1, JSTC], 3, [1, 1], 1>;
defm : JWriteResYMMPair<WriteCvtI2PDY, [JFPU1, JSTC], 3, [2, 2], 2>;
defm : X86WriteResPairUnsupported<WriteCvtI2PDZ>;

// Single -> double precision.
defm : JWriteResFpuPair<WriteCvtSS2SD, [JFPU1, JSTC], 7, [1, 2], 2>;
defm : JWriteResFpuPair<WriteCvtPS2PD, [JFPU1, JSTC], 2, [1, 1], 1>;
defm : JWriteResYMMPair<WriteCvtPS2PDY, [JFPU1, JSTC], 2, [2, 2], 2>;
defm : X86WriteResPairUnsupported<WriteCvtPS2PDZ>;

// Double -> single precision.
defm : JWriteResFpuPair<WriteCvtSD2SS, [JFPU1, JSTC], 7, [1, 2], 2>;
defm : JWriteResFpuPair<WriteCvtPD2PS, [JFPU1, JSTC], 3, [1, 1], 1>;
defm : JWriteResYMMPair<WriteCvtPD2PSY, [JFPU1, JSTC, JFPX], 6, [2, 2, 4], 3>;
defm : X86WriteResPairUnsupported<WriteCvtPD2PSZ>;

// Half -> single precision.
defm : JWriteResFpuPair<WriteCvtPH2PS, [JFPU1, JSTC], 3, [1, 1], 1>;
defm : JWriteResYMMPair<WriteCvtPH2PSY, [JFPU1, JSTC], 3, [2, 2], 2>;
defm : X86WriteResPairUnsupported<WriteCvtPH2PSZ>;

// Single -> half precision: register forms, then the store (St) forms, which
// add JSAGU.
defm : X86WriteRes<WriteCvtPS2PH, [JFPU1, JSTC], 3, [1, 1], 1>;
defm : X86WriteRes<WriteCvtPS2PHY, [JFPU1, JSTC, JFPX], 6, [2, 2, 2], 3>;
defm : X86WriteResUnsupported<WriteCvtPS2PHZ>;
defm : X86WriteRes<WriteCvtPS2PHSt, [JFPU1, JSTC, JSAGU], 4, [1, 1, 1], 1>;
defm : X86WriteRes<WriteCvtPS2PHYSt, [JFPU1, JSTC, JFPX, JSAGU], 7, [2, 2, 2, 1], 3>;
defm : X86WriteResUnsupported<WriteCvtPS2PHZSt>;
654
655////////////////////////////////////////////////////////////////////////////////
656// Vector integer operations.
657////////////////////////////////////////////////////////////////////////////////
658// NOTE(review): arguments appear to be <write, [resources], latency, [cycles per resource], micro-ops>; confirm against X86WriteRes (defined outside this chunk).
659defm : X86WriteRes<WriteVecLoad,          [JLAGU, JFPU01, JVALU], 5, [1, 1, 1], 1>;
660defm : X86WriteRes<WriteVecLoadX,         [JLAGU], 5, [1], 1>; // The X/Y loads list only JLAGU.
661defm : X86WriteRes<WriteVecLoadY,         [JLAGU], 5, [2], 2>;
662defm : X86WriteRes<WriteVecLoadNT,        [JLAGU, JFPU01, JVALU], 5, [1, 1, 1], 1>;
663defm : X86WriteRes<WriteVecLoadNTY,       [JLAGU, JFPU01, JVALU], 5, [1, 1, 1], 1>; // NOTE(review): identical to WriteVecLoadNT (no doubling as in WriteVecLoadY); confirm intended.
664defm : X86WriteRes<WriteVecMaskedLoad,    [JLAGU, JFPU01, JVALU], 6, [1, 2, 2], 1>;
665defm : X86WriteRes<WriteVecMaskedLoadY,   [JLAGU, JFPU01, JVALU], 6, [2, 4, 4], 2>;
666// Stores: the plain and NT forms use [JSAGU, JFPU1, JSTC]; the masked forms use [JSAGU, JFPU01, JVALU].
667defm : X86WriteRes<WriteVecStore,         [JSAGU, JFPU1,   JSTC], 2, [1, 1, 1], 1>;
668defm : X86WriteRes<WriteVecStoreX,        [JSAGU, JFPU1,   JSTC], 1, [1, 1, 1], 1>;
669defm : X86WriteRes<WriteVecStoreY,        [JSAGU, JFPU1,   JSTC], 1, [2, 2, 2], 2>;
670defm : X86WriteRes<WriteVecStoreNT,       [JSAGU, JFPU1,   JSTC], 2, [1, 1, 1], 1>;
671defm : X86WriteRes<WriteVecStoreNTY,      [JSAGU, JFPU1,   JSTC], 2, [2, 2, 2], 1>; // NOTE(review): cycles are doubled but the last argument stays 1 (WriteVecStoreY uses 2); confirm intended.
672defm : X86WriteRes<WriteVecMaskedStore,   [JSAGU, JFPU01, JVALU], 6, [1, 1, 4], 1>;
673defm : X86WriteRes<WriteVecMaskedStoreY,  [JSAGU, JFPU01, JVALU], 6, [2, 2, 4], 2>;
674// Register moves; WriteVecMoveToGpr additionally occupies JALU0, an integer pipe (see the ProcResource list at the top of the file).
675defm : X86WriteRes<WriteVecMove,          [JFPU01, JVALU], 1, [1, 1], 1>;
676defm : X86WriteRes<WriteVecMoveX,         [JFPU01, JVALU], 1, [1, 1], 1>;
677defm : X86WriteRes<WriteVecMoveY,         [JFPU01, JVALU], 1, [2, 2], 2>;
678defm : X86WriteRes<WriteVecMoveToGpr,     [JFPU0, JFPA, JALU0], 4, [1, 1, 1], 1>;
679defm : X86WriteRes<WriteVecMoveFromGpr,   [JFPU01, JFPX], 8, [1, 1], 2>;
680// Every *Y class except WriteVecTestY, every *Z class, WriteVarVecShift and the two *256 shuffle classes below are marked unsupported.
681defm : JWriteResFpuPair<WriteVecALU,      [JFPU01, JVALU], 1>;
682defm : JWriteResFpuPair<WriteVecALUX,     [JFPU01, JVALU], 1>;
683defm : X86WriteResPairUnsupported<WriteVecALUY>;
684defm : X86WriteResPairUnsupported<WriteVecALUZ>;
685defm : JWriteResFpuPair<WriteVecShift,    [JFPU01, JVALU], 1>;
686defm : JWriteResFpuPair<WriteVecShiftX,   [JFPU01, JVALU], 2>; // +1cy latency.
687defm : X86WriteResPairUnsupported<WriteVecShiftY>;
688defm : X86WriteResPairUnsupported<WriteVecShiftZ>;
689defm : JWriteResFpuPair<WriteVecShiftImm, [JFPU01, JVALU], 1>;
690defm : JWriteResFpuPair<WriteVecShiftImmX,[JFPU01, JVALU], 2>; // +1cy latency.
691defm : X86WriteResPairUnsupported<WriteVecShiftImmY>;
692defm : X86WriteResPairUnsupported<WriteVecShiftImmZ>;
693defm : X86WriteResPairUnsupported<WriteVarVecShift>; // Unlike the other groups, the base (non-Y/Z) class is unsupported too.
694defm : X86WriteResPairUnsupported<WriteVarVecShiftY>;
695defm : X86WriteResPairUnsupported<WriteVarVecShiftZ>;
696defm : JWriteResFpuPair<WriteVecIMul,     [JFPU0, JVIMUL], 2>;
697defm : JWriteResFpuPair<WriteVecIMulX,    [JFPU0, JVIMUL], 2>;
698defm : X86WriteResPairUnsupported<WriteVecIMulY>;
699defm : X86WriteResPairUnsupported<WriteVecIMulZ>;
700defm : JWriteResFpuPair<WritePMULLD,      [JFPU0, JFPU01, JVIMUL, JVALU], 4, [2, 1, 2, 1], 3>;
701defm : X86WriteResPairUnsupported<WritePMULLDY>;
702defm : X86WriteResPairUnsupported<WritePMULLDZ>;
703defm : JWriteResFpuPair<WriteMPSAD,       [JFPU0, JVIMUL], 3, [1, 2], 3>;
704defm : X86WriteResPairUnsupported<WriteMPSADY>;
705defm : X86WriteResPairUnsupported<WriteMPSADZ>;
706defm : JWriteResFpuPair<WritePSADBW,      [JFPU01, JVALU], 2>;
707defm : JWriteResFpuPair<WritePSADBWX,     [JFPU01, JVALU], 2>;
708defm : X86WriteResPairUnsupported<WritePSADBWY>;
709defm : X86WriteResPairUnsupported<WritePSADBWZ>;
710defm : JWriteResFpuPair<WritePHMINPOS,    [JFPU01, JVALU], 2>;
711defm : JWriteResFpuPair<WriteShuffle,     [JFPU01, JVALU], 1>;
712defm : JWriteResFpuPair<WriteShuffleX,    [JFPU01, JVALU], 1>;
713defm : X86WriteResPairUnsupported<WriteShuffleY>;
714defm : X86WriteResPairUnsupported<WriteShuffleZ>;
715defm : JWriteResFpuPair<WriteVarShuffle,  [JFPU01, JVALU], 2, [1, 1], 1>;
716defm : JWriteResFpuPair<WriteVarShuffleX, [JFPU01, JVALU], 2, [1, 4], 3>;
717defm : X86WriteResPairUnsupported<WriteVarShuffleY>;
718defm : X86WriteResPairUnsupported<WriteVarShuffleZ>;
719defm : JWriteResFpuPair<WriteBlend,       [JFPU01, JVALU], 1>;
720defm : X86WriteResPairUnsupported<WriteBlendY>;
721defm : X86WriteResPairUnsupported<WriteBlendZ>;
722defm : JWriteResFpuPair<WriteVarBlend,    [JFPU01, JVALU], 2, [4, 4], 3>;
723defm : X86WriteResPairUnsupported<WriteVarBlendY>;
724defm : X86WriteResPairUnsupported<WriteVarBlendZ>;
725defm : JWriteResFpuPair<WriteVecLogic,    [JFPU01, JVALU], 1>;
726defm : JWriteResFpuPair<WriteVecLogicX,   [JFPU01, JVALU], 1>;
727defm : X86WriteResPairUnsupported<WriteVecLogicY>;
728defm : X86WriteResPairUnsupported<WriteVecLogicZ>;
729defm : JWriteResFpuPair<WriteVecTest,     [JFPU0, JFPA, JALU0], 3>;
730defm : JWriteResYMMPair<WriteVecTestY,    [JFPU01, JFPX, JFPA, JALU0], 4, [2, 2, 2, 1], 3>; // The only *Y class modeled in this group (via JWriteResYMMPair).
731defm : X86WriteResPairUnsupported<WriteVecTestZ>;
732defm : X86WriteResPairUnsupported<WriteShuffle256>;
733defm : X86WriteResPairUnsupported<WriteVarShuffle256>;
734
735////////////////////////////////////////////////////////////////////////////////
736// Vector insert/extract operations.
737////////////////////////////////////////////////////////////////////////////////
738// Register and memory (Ld/St) forms are listed individually rather than through a *Pair multiclass.
739defm : X86WriteRes<WriteVecInsert,      [JFPU01, JVALU], 1, [1,1], 2>;
740defm : X86WriteRes<WriteVecInsertLd,    [JFPU01, JVALU, JLAGU], 4, [1,1,1], 1>; // Adds JLAGU to the register form's resources.
741defm : X86WriteRes<WriteVecExtract,     [JFPU0, JFPA, JALU0], 3, [1,1,1], 1>;
742defm : X86WriteRes<WriteVecExtractSt,   [JFPU1, JSTC, JSAGU], 3, [1,1,1], 1>; // Uses a different resource set from WriteVecExtract, not a superset.
743
744////////////////////////////////////////////////////////////////////////////////
745// SSE42 String instructions.
746////////////////////////////////////////////////////////////////////////////////
747// WritePCmpIStrI/M differ only in the third argument (7 vs 8); WritePCmpEStrI/M use identical arguments.
748defm : JWriteResFpuPair<WritePCmpIStrI, [JFPU1, JVALU1, JFPU0, JFPA, JALU0], 7, [2, 2, 1, 1, 1], 3>;
749defm : JWriteResFpuPair<WritePCmpIStrM, [JFPU1, JVALU1, JFPU0, JFPA, JALU0], 8, [2, 2, 1, 1, 1], 3>;
750defm : JWriteResFpuPair<WritePCmpEStrI, [JFPU1, JSAGU, JLAGU, JVALU, JVALU1, JFPA, JALU0], 14, [1, 2, 2, 6, 4, 1, 1], 9>;
751defm : JWriteResFpuPair<WritePCmpEStrM, [JFPU1, JSAGU, JLAGU, JVALU, JVALU1, JFPA, JALU0], 14, [1, 2, 2, 6, 4, 1, 1], 9>;
752
753////////////////////////////////////////////////////////////////////////////////
754// MOVMSK Instructions.
755////////////////////////////////////////////////////////////////////////////////
756// The three modeled MOVMSK classes all use [JFPU0, JFPA, JALU0] with Latency = 3; WriteVecMOVMSKY is marked unsupported.
757def  : WriteRes<WriteFMOVMSK,    [JFPU0, JFPA, JALU0]> { let Latency = 3; }
758def  : WriteRes<WriteVecMOVMSK,  [JFPU0, JFPA, JALU0]> { let Latency = 3; }
759defm : X86WriteResUnsupported<WriteVecMOVMSKY>;
760def  : WriteRes<WriteMMXMOVMSK,  [JFPU0, JFPA, JALU0]> { let Latency = 3; }
761
762////////////////////////////////////////////////////////////////////////////////
763// AES Instructions.
764////////////////////////////////////////////////////////////////////////////////
765// WriteAESIMC and WriteAESKeyGen use [JFPU0, JVIMUL]; WriteAESDecEnc lists [JFPU01, JVALU] in addition to those two.
766defm : JWriteResFpuPair<WriteAESIMC,      [JFPU0, JVIMUL], 2>;
767defm : JWriteResFpuPair<WriteAESKeyGen,   [JFPU0, JVIMUL], 2>;
768defm : JWriteResFpuPair<WriteAESDecEnc,   [JFPU01, JVALU, JFPU0, JVIMUL], 3, [1,1,1,1], 2>;
769
770////////////////////////////////////////////////////////////////////////////////
771// Horizontal add/sub instructions.
772////////////////////////////////////////////////////////////////////////////////
773// WriteFHAdd/WriteFHAddY use [JFPU0, JFPA]; WritePHAdd/WritePHAddX use [JFPU01, JVALU]; WritePHAddY is marked unsupported.
774defm : JWriteResFpuPair<WriteFHAdd,         [JFPU0, JFPA], 4>;            // +1cy latency.
775defm : JWriteResYMMPair<WriteFHAddY,        [JFPU0, JFPA], 4, [2,2], 2>;  // +1cy latency.
776defm : JWriteResFpuPair<WritePHAdd,         [JFPU01, JVALU], 1>;
777defm : JWriteResFpuPair<WritePHAddX,        [JFPU01, JVALU], 2>;          // +1cy latency.
778defm : X86WriteResPairUnsupported<WritePHAddY>;
779
780////////////////////////////////////////////////////////////////////////////////
781// Carry-less multiplication instructions.
782////////////////////////////////////////////////////////////////////////////////
783// Uses the same [JFPU0, JVIMUL] resources and third argument (2) as WriteVecIMul earlier in this file.
784defm : JWriteResFpuPair<WriteCLMul,       [JFPU0, JVIMUL], 2>;
785
786////////////////////////////////////////////////////////////////////////////////
787// SSE4A instructions.
788////////////////////////////////////////////////////////////////////////////////
789// Custom write bound to INSERTQ and INSERTQI through the InstRW below.
790def JWriteINSERTQ: SchedWriteRes<[JFPU01, JVALU]> {
791  let Latency = 2;
792  let ResourceCycles = [1, 4]; // Entries pair positionally with [JFPU01, JVALU].
793}
794def : InstRW<[JWriteINSERTQ], (instrs INSERTQ, INSERTQI)>;
795
796////////////////////////////////////////////////////////////////////////////////
797// AVX instructions.
798////////////////////////////////////////////////////////////////////////////////
799// JWriteVecExtractF128 overrides no fields, so VEXTRACTF128rr gets the SchedWriteRes defaults on [JFPU01, JFPX].
800def JWriteVecExtractF128: SchedWriteRes<[JFPU01, JFPX]>;
801def : InstRW<[JWriteVecExtractF128], (instrs VEXTRACTF128rr)>;
802// One write shared by VBROADCASTSDYrm, VBROADCASTSSYrm and VBROADCASTF128.
803def JWriteVBROADCASTYLd: SchedWriteRes<[JLAGU, JFPU01, JFPX]> {
804  let Latency = 6;
805  let ResourceCycles = [1, 2, 4]; // Entries pair positionally with [JLAGU, JFPU01, JFPX].
806  let NumMicroOps = 2;
807}
808def : InstRW<[JWriteVBROADCASTYLd], (instrs VBROADCASTSDYrm,
809                                            VBROADCASTSSYrm,
810                                            VBROADCASTF128)>;
811// VZEROALL: the resource list is empty, so only Latency and NumMicroOps are modeled.
812def JWriteJVZEROALL: SchedWriteRes<[]> {
813  let Latency = 90;
814  let NumMicroOps = 73;
815}
816def : InstRW<[JWriteJVZEROALL], (instrs VZEROALL)>;
817// VZEROUPPER: also resource-less, with smaller Latency and NumMicroOps than the VZEROALL write.
818def JWriteJVZEROUPPER: SchedWriteRes<[]> {
819  let Latency = 46;
820  let NumMicroOps = 37;
821}
822def : InstRW<[JWriteJVZEROUPPER], (instrs VZEROUPPER)>;
823
824///////////////////////////////////////////////////////////////////////////////
825//  SSE2/AVX Store Selected Bytes of Double Quadword - (V)MASKMOVDQU
826///////////////////////////////////////////////////////////////////////////////
827// One write shared by all four MASKMOVDQU opcode variants listed in the InstRW below.
828def JWriteMASKMOVDQU: SchedWriteRes<[JFPU0, JFPA, JFPU1, JSTC, JLAGU, JSAGU, JALU01]> {
829  let Latency = 34;
830  let ResourceCycles = [1, 1, 2, 2, 2, 16, 42]; // One entry per resource above, in the same order.
831  let NumMicroOps = 63;
832}
833def : InstRW<[JWriteMASKMOVDQU], (instrs MASKMOVDQU, MASKMOVDQU64,
834                                         VMASKMOVDQU, VMASKMOVDQU64)>;
835
836///////////////////////////////////////////////////////////////////////////////
837//  SchedWriteVariant definitions.
838///////////////////////////////////////////////////////////////////////////////
839
840def JWriteZeroLatency : SchedWriteRes<[]> { // Resource-less write selected by the zero-idiom variants below.
841  let Latency = 0;
842}
843
844def JWriteZeroIdiomYmm : SchedWriteRes<[JFPU01, JFPX]> { // Zero-idiom write used by JWriteFZeroIdiomY and JWriteVPERM2F128; still lists FPU resources.
845  let NumMicroOps = 2;
846}
847
848// Certain instructions that use the same register for both source
849// operands do not have a real dependency on the previous contents of the
850// register, and thus, do not have to wait before completing. They can be
851// optimized out at the register renaming stage.
852// Reference: Section 10.8 of the "Software Optimization Guide for AMD Family
853// 15h Processors".
854// Reference: Agner Fog's "The microarchitecture of Intel, AMD and VIA CPUs",
855// Section 21.8 [Dependency-breaking instructions].
856
857def JWriteZeroIdiom : SchedWriteVariant<[
858    SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [JWriteZeroLatency]>, // Predicate matches: Latency = 0, no resources.
859    SchedVar<NoSchedPred,                          [WriteALU]>           // Otherwise: the regular WriteALU class.
860]>;
861def : InstRW<[JWriteZeroIdiom], (instrs SUB32rr, SUB64rr,
862                                        XOR32rr, XOR64rr)>;
863
864def JWriteFZeroIdiom : SchedWriteVariant<[
865    SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [JWriteZeroLatency]>, // Predicate matches: zero-latency, resource-less write.
866    SchedVar<NoSchedPred,                          [WriteFLogic]>        // Otherwise: the regular WriteFLogic class.
867]>;
868def : InstRW<[JWriteFZeroIdiom], (instrs XORPSrr, VXORPSrr, XORPDrr, VXORPDrr,
869                                         ANDNPSrr, VANDNPSrr,
870                                         ANDNPDrr, VANDNPDrr)>;
871
872def JWriteFZeroIdiomY : SchedWriteVariant<[
873    SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [JWriteZeroIdiomYmm]>, // Predicate matches: JWriteZeroIdiomYmm (2 micro-ops on [JFPU01, JFPX]), not the zero-latency write.
874    SchedVar<NoSchedPred,                          [WriteFLogicY]>        // Otherwise: the regular WriteFLogicY class.
875]>;
876def : InstRW<[JWriteFZeroIdiomY], (instrs VXORPSYrr, VXORPDYrr,
877                                          VANDNPSYrr, VANDNPDYrr)>;
878
879def JWriteVZeroIdiomLogic : SchedWriteVariant<[
880    SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [JWriteZeroLatency]>, // Predicate matches: zero-latency, resource-less write.
881    SchedVar<NoSchedPred,                          [WriteVecLogic]>      // Otherwise: the regular WriteVecLogic class.
882]>;
883def : InstRW<[JWriteVZeroIdiomLogic], (instrs MMX_PXORirr, MMX_PANDNirr)>; // Only the two MMX opcodes; the XMM forms use JWriteVZeroIdiomLogicX.
884
885def JWriteVZeroIdiomLogicX : SchedWriteVariant<[
886    SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [JWriteZeroLatency]>, // Predicate matches: zero-latency, resource-less write.
887    SchedVar<NoSchedPred,                          [WriteVecLogicX]>     // Otherwise: the regular WriteVecLogicX class.
888]>;
889def : InstRW<[JWriteVZeroIdiomLogicX], (instrs PXORrr, VPXORrr,
890                                               PANDNrr, VPANDNrr)>;
891
892def JWriteVZeroIdiomALU : SchedWriteVariant<[
893    SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [JWriteZeroLatency]>, // Predicate matches: zero-latency, resource-less write.
894    SchedVar<NoSchedPred,                          [WriteVecALU]>        // Otherwise: the regular WriteVecALU class.
895]>;
896def : InstRW<[JWriteVZeroIdiomALU], (instrs MMX_PSUBBirr, MMX_PSUBDirr, // MMX subtract and compare-greater-than opcodes only.
897                                            MMX_PSUBQirr, MMX_PSUBWirr,
898                                            MMX_PSUBSBirr, MMX_PSUBSWirr,
899                                            MMX_PSUBUSBirr, MMX_PSUBUSWirr,
900                                            MMX_PCMPGTBirr, MMX_PCMPGTDirr,
901                                            MMX_PCMPGTWirr)>;
902
903def JWriteVZeroIdiomALUX : SchedWriteVariant<[
904    SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [JWriteZeroLatency]>, // Predicate matches: zero-latency, resource-less write.
905    SchedVar<NoSchedPred,                          [WriteVecALUX]>       // Otherwise: the regular WriteVecALUX class.
906]>;
907def : InstRW<[JWriteVZeroIdiomALUX], (instrs PSUBBrr, VPSUBBrr, // Each row pairs an opcode with its V-prefixed counterpart.
908                                             PSUBDrr, VPSUBDrr,
909                                             PSUBQrr, VPSUBQrr,
910                                             PSUBWrr, VPSUBWrr,
911                                             PSUBSBrr, VPSUBSBrr,
912                                             PSUBSWrr, VPSUBSWrr,
913                                             PSUBUSBrr, VPSUBUSBrr,
914                                             PSUBUSWrr, VPSUBUSWrr,
915                                             PCMPGTBrr, VPCMPGTBrr,
916                                             PCMPGTDrr, VPCMPGTDrr,
917                                             PCMPGTQrr, VPCMPGTQrr,
918                                             PCMPGTWrr, VPCMPGTWrr)>;
919
920def JWriteVPERM2F128 : SchedWriteVariant<[
921  SchedVar<MCSchedPredicate<ZeroIdiomVPERMPredicate>, [JWriteZeroIdiomYmm]>, // Uses its own predicate (ZeroIdiomVPERMPredicate), not ZeroIdiomPredicate.
922  SchedVar<NoSchedPred,                               [WriteFShuffle256]>    // Otherwise: the regular WriteFShuffle256 class.
923]>;
924def : InstRW<[JWriteVPERM2F128], (instrs VPERM2F128rr)>;
925
926// This write is used for slow LEA instructions (selected by JWriteLEA when JSlowLEAPredicate matches).
927def JWrite3OpsLEA : SchedWriteRes<[JALU1, JSAGU]> { // JSAGU is documented at the top of the file as also handling 3-operand LEA.
928  let Latency = 2;
929}
930
931// On Jaguar, a slow LEA is either a 3Ops LEA (base, index, offset), or an LEA
932// with a `Scale` value different from 1.
933def JSlowLEAPredicate : MCSchedPredicate<
934  CheckAny<[
935    // A 3-operand LEA (base, index, offset).
936    IsThreeOperandsLEAFn,
937    // An LEA with a "Scale" different from 1.
938    CheckAll<[
939      CheckIsImmOperand<2>,            // Operand index 2 must be an immediate...
940      CheckNot<CheckImmOperand<2, 1>>  // ...whose value is not 1.
941    ]>
942  ]>
943>;
944
945def JWriteLEA : SchedWriteVariant<[
946    SchedVar<JSlowLEAPredicate, [JWrite3OpsLEA]>, // Slow LEA: [JALU1, JSAGU] with Latency = 2.
947    SchedVar<NoSchedPred,       [WriteLEA]>       // Otherwise: the regular WriteLEA class.
948]>;
949
950def : InstRW<[JWriteLEA], (instrs LEA32r, LEA64r, LEA64_32r)>; // LEA16r is not in this list; it is bound to JSlowLEA16r.
951
952def JSlowLEA16r : SchedWriteRes<[JALU01]> { // Fixed write for LEA16r; no predicate or variant is involved.
953  let Latency = 3;
954  let ResourceCycles = [4]; // Cycles charged to JALU01.
955}
956
957def : InstRW<[JSlowLEA16r], (instrs LEA16r)>;
958
959///////////////////////////////////////////////////////////////////////////////
960// Dependency breaking instructions.
961///////////////////////////////////////////////////////////////////////////////
962// The opcode sets below mirror the InstRW lists of the zero-idiom SchedWriteVariants earlier in this file; keep the two in sync.
963def : IsZeroIdiomFunction<[
964  // GPR Zero-idioms.
965  DepBreakingClass<[ SUB32rr, SUB64rr, XOR32rr, XOR64rr ], ZeroIdiomPredicate>,
966
967  // MMX Zero-idioms.
968  DepBreakingClass<[
969    MMX_PXORirr, MMX_PANDNirr, MMX_PSUBBirr,
970    MMX_PSUBDirr, MMX_PSUBQirr, MMX_PSUBWirr,
971    MMX_PSUBSBirr, MMX_PSUBSWirr, MMX_PSUBUSBirr, MMX_PSUBUSWirr,
972    MMX_PCMPGTBirr, MMX_PCMPGTDirr, MMX_PCMPGTWirr
973  ], ZeroIdiomPredicate>,
974
975  // SSE Zero-idioms.
976  DepBreakingClass<[
977    // fp variants.
978    XORPSrr, XORPDrr, ANDNPSrr, ANDNPDrr,
979
980    // int variants.
981    PXORrr, PANDNrr,
982    PSUBBrr, PSUBWrr, PSUBDrr, PSUBQrr,
983    PSUBSBrr, PSUBSWrr, PSUBUSBrr, PSUBUSWrr,
984    PCMPGTBrr, PCMPGTDrr, PCMPGTQrr, PCMPGTWrr
985  ], ZeroIdiomPredicate>,
986
987  // AVX Zero-idioms.
988  DepBreakingClass<[
989    // xmm fp variants.
990    VXORPSrr, VXORPDrr, VANDNPSrr, VANDNPDrr,
991
992    // xmm int variants.
993    VPXORrr, VPANDNrr,
994    VPSUBBrr, VPSUBWrr, VPSUBDrr, VPSUBQrr,
995    VPSUBSBrr, VPSUBSWrr, VPSUBUSBrr, VPSUBUSWrr,
996    VPCMPGTBrr, VPCMPGTWrr, VPCMPGTDrr, VPCMPGTQrr,
997
998    // ymm variants.
999    VXORPSYrr, VXORPDYrr, VANDNPSYrr, VANDNPDYrr
1000  ], ZeroIdiomPredicate>,
1001
1002  DepBreakingClass<[ VPERM2F128rr ], ZeroIdiomVPERMPredicate> // The only class here with its own predicate.
1003]>;
1004
1005def : IsDepBreakingFunction<[ // None of these opcodes appear in the IsZeroIdiomFunction list earlier in this file.
1006  // GPR
1007  DepBreakingClass<[ SBB32rr, SBB64rr ], ZeroIdiomPredicate>,
1008  DepBreakingClass<[ CMP32rr, CMP64rr ], CheckSameRegOperand<0, 1> >, // CMP uses an explicit same-register check on operands 0 and 1 instead of ZeroIdiomPredicate.
1009
1010  // MMX
1011  DepBreakingClass<[
1012    MMX_PCMPEQBirr, MMX_PCMPEQDirr, MMX_PCMPEQWirr
1013  ], ZeroIdiomPredicate>,
1014
1015  // SSE
1016  DepBreakingClass<[ 
1017    PCMPEQBrr, PCMPEQWrr, PCMPEQDrr, PCMPEQQrr
1018  ], ZeroIdiomPredicate>,
1019
1020  // AVX
1021  DepBreakingClass<[
1022    VPCMPEQBrr, VPCMPEQWrr, VPCMPEQDrr, VPCMPEQQrr
1023  ], ZeroIdiomPredicate>
1024]>;
1025
1026def : IsOptimizableRegisterMove<[ // A single equivalence class covering GPR, MMX, SSE and AVX register-to-register moves.
1027  InstructionEquivalenceClass<[
1028    // GPR variants.
1029    MOV32rr, MOV64rr,
1030
1031    // MMX variants.
1032    MMX_MOVQ64rr,
1033
1034    // SSE variants.
1035    MOVAPSrr, MOVUPSrr,
1036    MOVAPDrr, MOVUPDrr,
1037    MOVDQArr, MOVDQUrr,
1038
1039    // AVX variants.
1040    VMOVAPSrr, VMOVUPSrr,
1041    VMOVAPDrr, VMOVUPDrr,
1042    VMOVDQArr, VMOVDQUrr
1043  ], TruePred > // NOTE(review): TruePred presumably means no extra condition applies to the listed opcodes; confirm.
1044]>;
1045
1046} // SchedModel
1047