X86InstrSSE.td revision 263508
1//===-- X86InstrSSE.td - SSE Instruction Set ---------------*- tablegen -*-===//
2//
3//                     The LLVM Compiler Infrastructure
4//
5// This file is distributed under the University of Illinois Open Source
6// License. See LICENSE.TXT for details.
7//
8//===----------------------------------------------------------------------===//
9//
10// This file describes the X86 SSE instruction set, defining the instructions,
11// and properties of the instructions which are needed for code generation,
12// machine code emission, and analysis.
13//
14//===----------------------------------------------------------------------===//
15
// Pairs the legacy itinerary classes for an instruction's register-register
// and register-memory forms, plus its new-style sched-model write resource.
class OpndItins<InstrItinClass arg_rr, InstrItinClass arg_rm> {
  InstrItinClass rr = arg_rr;  // reg-reg form
  InstrItinClass rm = arg_rm;  // reg-mem form
  // InstrSchedModel info; defs override this via 'let Sched = ...'.
  X86FoldableSchedWrite Sched = WriteFAdd;
}
22
// Bundles the single- and double-precision OpndItins of one operation.
class SizeItins<OpndItins arg_s, OpndItins arg_d> {
  OpndItins s = arg_s;  // f32 variant
  OpndItins d = arg_d;  // f64 variant
}
27
28
// Like OpndItins, with a third itinerary class for the reg-immediate form
// used by the vector integer shifts.
class ShiftOpndItins<InstrItinClass arg_rr, InstrItinClass arg_rm,
                     InstrItinClass arg_ri> {
  InstrItinClass rr = arg_rr;  // reg-reg form
  InstrItinClass rm = arg_rm;  // reg-mem form
  InstrItinClass ri = arg_ri;  // reg-imm form
}
35
36
// scalar FP add/sub itineraries
let Sched = WriteFAdd in {
def SSE_ALU_F32S : OpndItins<IIC_SSE_ALU_F32S_RR, IIC_SSE_ALU_F32S_RM>;
def SSE_ALU_F64S : OpndItins<IIC_SSE_ALU_F64S_RR, IIC_SSE_ALU_F64S_RM>;
}

def SSE_ALU_ITINS_S : SizeItins<SSE_ALU_F32S, SSE_ALU_F64S>;
51
// scalar FP multiply itineraries.
// BUGFIX: SSE_MUL_F32S previously listed IIC_SSE_MUL_F64S_RM as its reg-mem
// itinerary (copy-paste from the f64 def below); it must use the f32-specific
// class so single- and double-precision multiplies can be scheduled
// independently by in-order itinerary models.
let Sched = WriteFMul in {
def SSE_MUL_F32S : OpndItins<
  IIC_SSE_MUL_F32S_RR, IIC_SSE_MUL_F32S_RM
>;

def SSE_MUL_F64S : OpndItins<
  IIC_SSE_MUL_F64S_RR, IIC_SSE_MUL_F64S_RM
>;
}
61
def SSE_MUL_ITINS_S : SizeItins<SSE_MUL_F32S, SSE_MUL_F64S>;
65
// scalar FP divide itineraries.
// BUGFIX: SSE_DIV_F32S previously listed IIC_SSE_DIV_F64S_RM as its reg-mem
// itinerary (copy-paste from the f64 def below); use the f32-specific class.
let Sched = WriteFDiv in {
def SSE_DIV_F32S : OpndItins<
  IIC_SSE_DIV_F32S_RR, IIC_SSE_DIV_F32S_RM
>;

def SSE_DIV_F64S : OpndItins<
  IIC_SSE_DIV_F64S_RR, IIC_SSE_DIV_F64S_RM
>;
}
75
def SSE_DIV_ITINS_S : SizeItins<SSE_DIV_F32S, SSE_DIV_F64S>;
79
// parallel (packed) FP add/sub itineraries
let Sched = WriteFAdd in {
def SSE_ALU_F32P : OpndItins<IIC_SSE_ALU_F32P_RR, IIC_SSE_ALU_F32P_RM>;
def SSE_ALU_F64P : OpndItins<IIC_SSE_ALU_F64P_RR, IIC_SSE_ALU_F64P_RM>;
}

def SSE_ALU_ITINS_P : SizeItins<SSE_ALU_F32P, SSE_ALU_F64P>;
94
// packed FP multiply itineraries.
// BUGFIX: SSE_MUL_F32P previously listed IIC_SSE_MUL_F64P_RM as its reg-mem
// itinerary (copy-paste from the f64 def below); use the f32-specific class.
let Sched = WriteFMul in {
def SSE_MUL_F32P : OpndItins<
  IIC_SSE_MUL_F32P_RR, IIC_SSE_MUL_F32P_RM
>;

def SSE_MUL_F64P : OpndItins<
  IIC_SSE_MUL_F64P_RR, IIC_SSE_MUL_F64P_RM
>;
}
104
def SSE_MUL_ITINS_P : SizeItins<SSE_MUL_F32P, SSE_MUL_F64P>;
108
// packed FP divide itineraries.
// BUGFIX: SSE_DIV_F32P previously listed IIC_SSE_DIV_F64P_RM as its reg-mem
// itinerary (copy-paste from the f64 def below); use the f32-specific class.
let Sched = WriteFDiv in {
def SSE_DIV_F32P : OpndItins<
  IIC_SSE_DIV_F32P_RR, IIC_SSE_DIV_F32P_RM
>;

def SSE_DIV_F64P : OpndItins<
  IIC_SSE_DIV_F64P_RR, IIC_SSE_DIV_F64P_RM
>;
}
118
def SSE_DIV_ITINS_P : SizeItins<SSE_DIV_F32P, SSE_DIV_F64P>;
122
// Packed bitwise-logic itineraries.
// NOTE(review): this def inherits the default Sched = WriteFAdd from
// OpndItins; a dedicated logic write (e.g. WriteVecLogic) may be more
// accurate — confirm against the sched models before changing.
def SSE_BIT_ITINS_P : OpndItins<IIC_SSE_BIT_P_RR, IIC_SSE_BIT_P_RM>;

// Packed integer ALU itineraries (word and quadword element variants).
let Sched = WriteVecALU in {
def SSE_INTALU_ITINS_P : OpndItins<IIC_SSE_INTALU_P_RR, IIC_SSE_INTALU_P_RM>;
def SSE_INTALUQ_ITINS_P : OpndItins<IIC_SSE_INTALUQ_P_RR,
                                    IIC_SSE_INTALUQ_P_RM>;
}

// Packed integer multiply.
let Sched = WriteVecIMul in
def SSE_INTMUL_ITINS_P : OpndItins<IIC_SSE_INTMUL_P_RR, IIC_SSE_INTMUL_P_RM>;

// Packed integer shifts; these also have a reg-immediate form.
def SSE_INTSHIFT_ITINS_P : ShiftOpndItins<
  IIC_SSE_INTSH_P_RR, IIC_SSE_INTSH_P_RM, IIC_SSE_INTSH_P_RI
>;
145
// Aligned / unaligned packed-move and DPPD dot-product itineraries.
def SSE_MOVA_ITINS : OpndItins<IIC_SSE_MOVA_P_RR, IIC_SSE_MOVA_P_RM>;

def SSE_MOVU_ITINS : OpndItins<IIC_SSE_MOVU_P_RR, IIC_SSE_MOVU_P_RM>;

def SSE_DPPD_ITINS : OpndItins<IIC_SSE_DPPD_RR, IIC_SSE_DPPD_RM>;
157
// DPPS dot-product itineraries.
// BUGFIX: the reg-mem itinerary previously named IIC_SSE_DPPD_RM (copy-paste
// from SSE_DPPD_ITINS above); use the DPPS-specific reg-mem class.
def SSE_DPPS_ITINS : OpndItins<
  IIC_SSE_DPPS_RR, IIC_SSE_DPPS_RM
>;
161
// Generic fallback itineraries plus SSE4.1 extract/insert/mpsadbw/pmulld.
def DEFAULT_ITINS : OpndItins<IIC_ALU_NONMEM, IIC_ALU_MEM>;

def SSE_EXTRACT_ITINS : OpndItins<IIC_SSE_EXTRACTPS_RR,
                                  IIC_SSE_EXTRACTPS_RM>;

def SSE_INSERT_ITINS : OpndItins<IIC_SSE_INSERTPS_RR, IIC_SSE_INSERTPS_RM>;

def SSE_MPSADBW_ITINS : OpndItins<IIC_SSE_MPSADBW_RR, IIC_SSE_MPSADBW_RM>;

def SSE_PMULLD_ITINS : OpndItins<IIC_SSE_PMULLD_RR, IIC_SSE_PMULLD_RM>;
181
182//===----------------------------------------------------------------------===//
183// SSE 1 & 2 Instructions Classes
184//===----------------------------------------------------------------------===//
185
/// sse12_fp_scalar - SSE 1 & 2 scalar instructions class.
/// Emits the rr and rm forms of a two-operand scalar FP instruction.
/// Is2Addr selects the 2-address (SSE) vs 3-address (AVX) asm string.
multiclass sse12_fp_scalar<bits<8> opc, string OpcodeStr, SDNode OpNode,
                           RegisterClass RC, X86MemOperand x86memop,
                           OpndItins itins,
                           bit Is2Addr = 1> {
  // reg-reg form; scalar FP ops here are commutable.
  let isCommutable = 1 in {
    def rr : SI<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
       !if(Is2Addr,
           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
       [(set RC:$dst, (OpNode RC:$src1, RC:$src2))], itins.rr>,
       Sched<[itins.Sched]>;
  }
  // reg-mem form; folds the load of $src2.
  def rm : SI<opc, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
       !if(Is2Addr,
           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
       [(set RC:$dst, (OpNode RC:$src1, (load addr:$src2)))], itins.rm>,
       Sched<[itins.Sched.Folded, ReadAfterLd]>;
}
206
/// sse12_fp_scalar_int - SSE 1 & 2 scalar instructions intrinsics class.
/// Like sse12_fp_scalar, but matches the int_x86_sse* intrinsic whose name
/// is assembled at TableGen time from SSEVer/OpcodeStr/FPSizeStr.
multiclass sse12_fp_scalar_int<bits<8> opc, string OpcodeStr, RegisterClass RC,
                             string asm, string SSEVer, string FPSizeStr,
                             Operand memopr, ComplexPattern mem_cpat,
                             OpndItins itins,
                             bit Is2Addr = 1> {
  // reg-reg intrinsic form.
  def rr_Int : SI<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
       !if(Is2Addr,
           !strconcat(asm, "\t{$src2, $dst|$dst, $src2}"),
           !strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
       [(set RC:$dst, (!cast<Intrinsic>(
                 !strconcat("int_x86_sse", SSEVer, "_", OpcodeStr, FPSizeStr))
             RC:$src1, RC:$src2))], itins.rr>,
       Sched<[itins.Sched]>;
  // reg-mem intrinsic form; mem_cpat matches the folded memory operand.
  def rm_Int : SI<opc, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, memopr:$src2),
       !if(Is2Addr,
           !strconcat(asm, "\t{$src2, $dst|$dst, $src2}"),
           !strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
       [(set RC:$dst, (!cast<Intrinsic>(!strconcat("int_x86_sse",
                                          SSEVer, "_", OpcodeStr, FPSizeStr))
             RC:$src1, mem_cpat:$src2))], itins.rm>,
       Sched<[itins.Sched.Folded, ReadAfterLd]>;
}
230
/// sse12_fp_packed - SSE 1 & 2 packed instructions class.
/// Emits rr and rm forms of a two-operand packed FP instruction in
/// execution domain d, with result type vt.
multiclass sse12_fp_packed<bits<8> opc, string OpcodeStr, SDNode OpNode,
                           RegisterClass RC, ValueType vt,
                           X86MemOperand x86memop, PatFrag mem_frag,
                           Domain d, OpndItins itins, bit Is2Addr = 1> {
  // reg-reg form.
  let isCommutable = 1 in
    def rr : PI<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
       !if(Is2Addr,
           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
       [(set RC:$dst, (vt (OpNode RC:$src1, RC:$src2)))], itins.rr, d>,
       Sched<[itins.Sched]>;
  // reg-mem form; mem_frag selects the (possibly aligned) load fragment.
  let mayLoad = 1 in
    def rm : PI<opc, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
       !if(Is2Addr,
           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
       [(set RC:$dst, (OpNode RC:$src1, (mem_frag addr:$src2)))],
          itins.rm, d>,
       Sched<[itins.Sched.Folded, ReadAfterLd]>;
}
252
/// sse12_fp_packed_logical_rm - SSE 1 & 2 packed instructions class.
/// Logical ops variant: the caller supplies the full rr/rm pattern lists.
multiclass sse12_fp_packed_logical_rm<bits<8> opc, RegisterClass RC, Domain d,
                                      string OpcodeStr, X86MemOperand x86memop,
                                      list<dag> pat_rr, list<dag> pat_rm,
                                      bit Is2Addr = 1> {
  // reg-reg form; pat_rr may be empty, hence hasSideEffects = 0.
  let isCommutable = 1, hasSideEffects = 0 in
    def rr : PI<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
       !if(Is2Addr,
           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
       pat_rr, NoItinerary, d>,
       Sched<[WriteVecLogic]>;
  // reg-mem form.
  def rm : PI<opc, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
       !if(Is2Addr,
           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
       pat_rm, NoItinerary, d>,
       Sched<[WriteVecLogicLd, ReadAfterLd]>;
}
272
273//===----------------------------------------------------------------------===//
274//  Non-instruction patterns
275//===----------------------------------------------------------------------===//
276
// A vector extract of the first f32/f64 position is a subregister copy.
def : Pat<(f32 (vector_extract (v4f32 VR128:$src), (iPTR 0))),
          (COPY_TO_REGCLASS (v4f32 VR128:$src), FR32)>;
def : Pat<(f64 (vector_extract (v2f64 VR128:$src), (iPTR 0))),
          (COPY_TO_REGCLASS (v2f64 VR128:$src), FR64)>;

// A 128-bit subvector extract from the first 256-bit vector position
// is a subregister copy that needs no instruction.
def : Pat<(v4i32 (extract_subvector (v8i32 VR256:$src), (iPTR 0))),
          (v4i32 (EXTRACT_SUBREG (v8i32 VR256:$src), sub_xmm))>;
def : Pat<(v4f32 (extract_subvector (v8f32 VR256:$src), (iPTR 0))),
          (v4f32 (EXTRACT_SUBREG (v8f32 VR256:$src), sub_xmm))>;

def : Pat<(v2i64 (extract_subvector (v4i64 VR256:$src), (iPTR 0))),
          (v2i64 (EXTRACT_SUBREG (v4i64 VR256:$src), sub_xmm))>;
def : Pat<(v2f64 (extract_subvector (v4f64 VR256:$src), (iPTR 0))),
          (v2f64 (EXTRACT_SUBREG (v4f64 VR256:$src), sub_xmm))>;

def : Pat<(v8i16 (extract_subvector (v16i16 VR256:$src), (iPTR 0))),
          (v8i16 (EXTRACT_SUBREG (v16i16 VR256:$src), sub_xmm))>;
def : Pat<(v16i8 (extract_subvector (v32i8 VR256:$src), (iPTR 0))),
          (v16i8 (EXTRACT_SUBREG (v32i8 VR256:$src), sub_xmm))>;
299
// A 128-bit subvector insert to the first 256-bit vector position
// is a subregister copy that needs no instruction.
let AddedComplexity = 25 in { // to give priority over vinsertf128rm
def : Pat<(insert_subvector undef, (v2i64 VR128:$src), (iPTR 0)),
          (INSERT_SUBREG (v4i64 (IMPLICIT_DEF)), VR128:$src, sub_xmm)>;
def : Pat<(insert_subvector undef, (v2f64 VR128:$src), (iPTR 0)),
          (INSERT_SUBREG (v4f64 (IMPLICIT_DEF)), VR128:$src, sub_xmm)>;
def : Pat<(insert_subvector undef, (v4i32 VR128:$src), (iPTR 0)),
          (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)), VR128:$src, sub_xmm)>;
def : Pat<(insert_subvector undef, (v4f32 VR128:$src), (iPTR 0)),
          (INSERT_SUBREG (v8f32 (IMPLICIT_DEF)), VR128:$src, sub_xmm)>;
def : Pat<(insert_subvector undef, (v8i16 VR128:$src), (iPTR 0)),
          (INSERT_SUBREG (v16i16 (IMPLICIT_DEF)), VR128:$src, sub_xmm)>;
def : Pat<(insert_subvector undef, (v16i8 VR128:$src), (iPTR 0)),
          (INSERT_SUBREG (v32i8 (IMPLICIT_DEF)), VR128:$src, sub_xmm)>;
}
316
// Implicitly promote a 32-bit scalar to a vector.
def : Pat<(v4f32 (scalar_to_vector FR32:$src)),
          (COPY_TO_REGCLASS FR32:$src, VR128)>;
def : Pat<(v8f32 (scalar_to_vector FR32:$src)),
          (COPY_TO_REGCLASS FR32:$src, VR128)>;
// Implicitly promote a 64-bit scalar to a vector.
def : Pat<(v2f64 (scalar_to_vector FR64:$src)),
          (COPY_TO_REGCLASS FR64:$src, VR128)>;
def : Pat<(v4f64 (scalar_to_vector FR64:$src)),
          (COPY_TO_REGCLASS FR64:$src, VR128)>;
327
// Bitcasts between 128-bit vector types. Return the original type since
// no instruction is needed for the conversion.
let Predicates = [HasSSE2] in {
  def : Pat<(v2i64 (bitconvert (v4i32 VR128:$src))), (v2i64 VR128:$src)>;
  def : Pat<(v2i64 (bitconvert (v8i16 VR128:$src))), (v2i64 VR128:$src)>;
  def : Pat<(v2i64 (bitconvert (v16i8 VR128:$src))), (v2i64 VR128:$src)>;
  def : Pat<(v2i64 (bitconvert (v2f64 VR128:$src))), (v2i64 VR128:$src)>;
  def : Pat<(v2i64 (bitconvert (v4f32 VR128:$src))), (v2i64 VR128:$src)>;
  def : Pat<(v4i32 (bitconvert (v2i64 VR128:$src))), (v4i32 VR128:$src)>;
  def : Pat<(v4i32 (bitconvert (v8i16 VR128:$src))), (v4i32 VR128:$src)>;
  def : Pat<(v4i32 (bitconvert (v16i8 VR128:$src))), (v4i32 VR128:$src)>;
  def : Pat<(v4i32 (bitconvert (v2f64 VR128:$src))), (v4i32 VR128:$src)>;
  def : Pat<(v4i32 (bitconvert (v4f32 VR128:$src))), (v4i32 VR128:$src)>;
  def : Pat<(v8i16 (bitconvert (v2i64 VR128:$src))), (v8i16 VR128:$src)>;
  def : Pat<(v8i16 (bitconvert (v4i32 VR128:$src))), (v8i16 VR128:$src)>;
  def : Pat<(v8i16 (bitconvert (v16i8 VR128:$src))), (v8i16 VR128:$src)>;
  def : Pat<(v8i16 (bitconvert (v2f64 VR128:$src))), (v8i16 VR128:$src)>;
  def : Pat<(v8i16 (bitconvert (v4f32 VR128:$src))), (v8i16 VR128:$src)>;
  def : Pat<(v16i8 (bitconvert (v2i64 VR128:$src))), (v16i8 VR128:$src)>;
  def : Pat<(v16i8 (bitconvert (v4i32 VR128:$src))), (v16i8 VR128:$src)>;
  def : Pat<(v16i8 (bitconvert (v8i16 VR128:$src))), (v16i8 VR128:$src)>;
  def : Pat<(v16i8 (bitconvert (v2f64 VR128:$src))), (v16i8 VR128:$src)>;
  def : Pat<(v16i8 (bitconvert (v4f32 VR128:$src))), (v16i8 VR128:$src)>;
  def : Pat<(v4f32 (bitconvert (v2i64 VR128:$src))), (v4f32 VR128:$src)>;
  def : Pat<(v4f32 (bitconvert (v4i32 VR128:$src))), (v4f32 VR128:$src)>;
  def : Pat<(v4f32 (bitconvert (v8i16 VR128:$src))), (v4f32 VR128:$src)>;
  def : Pat<(v4f32 (bitconvert (v16i8 VR128:$src))), (v4f32 VR128:$src)>;
  def : Pat<(v4f32 (bitconvert (v2f64 VR128:$src))), (v4f32 VR128:$src)>;
  def : Pat<(v2f64 (bitconvert (v2i64 VR128:$src))), (v2f64 VR128:$src)>;
  def : Pat<(v2f64 (bitconvert (v4i32 VR128:$src))), (v2f64 VR128:$src)>;
  def : Pat<(v2f64 (bitconvert (v8i16 VR128:$src))), (v2f64 VR128:$src)>;
  def : Pat<(v2f64 (bitconvert (v16i8 VR128:$src))), (v2f64 VR128:$src)>;
  def : Pat<(v2f64 (bitconvert (v4f32 VR128:$src))), (v2f64 VR128:$src)>;
}
362
// Bitcasts between 256-bit vector types. Return the original type since
// no instruction is needed for the conversion.
let Predicates = [HasAVX] in {
  def : Pat<(v4f64  (bitconvert (v8f32 VR256:$src))),  (v4f64 VR256:$src)>;
  def : Pat<(v4f64  (bitconvert (v8i32 VR256:$src))),  (v4f64 VR256:$src)>;
  def : Pat<(v4f64  (bitconvert (v4i64 VR256:$src))),  (v4f64 VR256:$src)>;
  def : Pat<(v4f64  (bitconvert (v16i16 VR256:$src))), (v4f64 VR256:$src)>;
  def : Pat<(v4f64  (bitconvert (v32i8 VR256:$src))),  (v4f64 VR256:$src)>;
  def : Pat<(v8f32  (bitconvert (v8i32 VR256:$src))),  (v8f32 VR256:$src)>;
  def : Pat<(v8f32  (bitconvert (v4i64 VR256:$src))),  (v8f32 VR256:$src)>;
  def : Pat<(v8f32  (bitconvert (v4f64 VR256:$src))),  (v8f32 VR256:$src)>;
  def : Pat<(v8f32  (bitconvert (v32i8 VR256:$src))),  (v8f32 VR256:$src)>;
  def : Pat<(v8f32  (bitconvert (v16i16 VR256:$src))), (v8f32 VR256:$src)>;
  def : Pat<(v4i64  (bitconvert (v8f32 VR256:$src))),  (v4i64 VR256:$src)>;
  def : Pat<(v4i64  (bitconvert (v8i32 VR256:$src))),  (v4i64 VR256:$src)>;
  def : Pat<(v4i64  (bitconvert (v4f64 VR256:$src))),  (v4i64 VR256:$src)>;
  def : Pat<(v4i64  (bitconvert (v32i8 VR256:$src))),  (v4i64 VR256:$src)>;
  def : Pat<(v4i64  (bitconvert (v16i16 VR256:$src))), (v4i64 VR256:$src)>;
  def : Pat<(v32i8  (bitconvert (v4f64 VR256:$src))),  (v32i8 VR256:$src)>;
  def : Pat<(v32i8  (bitconvert (v4i64 VR256:$src))),  (v32i8 VR256:$src)>;
  def : Pat<(v32i8  (bitconvert (v8f32 VR256:$src))),  (v32i8 VR256:$src)>;
  def : Pat<(v32i8  (bitconvert (v8i32 VR256:$src))),  (v32i8 VR256:$src)>;
  def : Pat<(v32i8  (bitconvert (v16i16 VR256:$src))), (v32i8 VR256:$src)>;
  def : Pat<(v8i32  (bitconvert (v32i8 VR256:$src))),  (v8i32 VR256:$src)>;
  def : Pat<(v8i32  (bitconvert (v16i16 VR256:$src))), (v8i32 VR256:$src)>;
  def : Pat<(v8i32  (bitconvert (v8f32 VR256:$src))),  (v8i32 VR256:$src)>;
  def : Pat<(v8i32  (bitconvert (v4i64 VR256:$src))),  (v8i32 VR256:$src)>;
  def : Pat<(v8i32  (bitconvert (v4f64 VR256:$src))),  (v8i32 VR256:$src)>;
  def : Pat<(v16i16 (bitconvert (v8f32 VR256:$src))),  (v16i16 VR256:$src)>;
  def : Pat<(v16i16 (bitconvert (v8i32 VR256:$src))),  (v16i16 VR256:$src)>;
  def : Pat<(v16i16 (bitconvert (v4i64 VR256:$src))),  (v16i16 VR256:$src)>;
  def : Pat<(v16i16 (bitconvert (v4f64 VR256:$src))),  (v16i16 VR256:$src)>;
  def : Pat<(v16i16 (bitconvert (v32i8 VR256:$src))),  (v16i16 VR256:$src)>;
}
397
// Alias instructions that map fld0 to xorps for sse or vxorps for avx.
// This is expanded by ExpandPostRAPseudos.
let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1,
    isPseudo = 1, SchedRW = [WriteZero] in {
  def FsFLD0SS : I<0, Pseudo, (outs FR32:$dst), (ins), "",
                   [(set FR32:$dst, fp32imm0)]>, Requires<[HasSSE1]>;
  def FsFLD0SD : I<0, Pseudo, (outs FR64:$dst), (ins), "",
                   [(set FR64:$dst, fpimm0)]>, Requires<[HasSSE2]>;
}
407
408//===----------------------------------------------------------------------===//
409// AVX & SSE - Zero/One Vectors
410//===----------------------------------------------------------------------===//
411
// Alias instruction that maps zero vector to pxor / xorp* for sse.
// This is expanded by ExpandPostRAPseudos to an xorps / vxorps, and then
// swizzled by ExecutionDepsFix to pxor.
// We set canFoldAsLoad because this can be converted to a constant-pool
// load of an all-zeros value if folding it would be beneficial.
let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1,
    isPseudo = 1, SchedRW = [WriteZero] in {
def V_SET0 : I<0, Pseudo, (outs VR128:$dst), (ins), "",
               [(set VR128:$dst, (v4f32 immAllZerosV))]>;
}

// All other 128-bit zero-vector types select the same pseudo.
def : Pat<(v2f64 immAllZerosV), (V_SET0)>;
def : Pat<(v4i32 immAllZerosV), (V_SET0)>;
def : Pat<(v2i64 immAllZerosV), (V_SET0)>;
def : Pat<(v8i16 immAllZerosV), (V_SET0)>;
def : Pat<(v16i8 immAllZerosV), (V_SET0)>;
428
429
// The same as done above but for AVX.  The 256-bit AVX1 ISA doesn't support PI,
// and doesn't need it because on sandy bridge the register is set to zero
// at the rename stage without using any execution unit, so SET0PSY
// and SET0PDY can be used for vector int instructions without penalty.
let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1,
    isPseudo = 1, Predicates = [HasAVX], SchedRW = [WriteZero] in {
def AVX_SET0 : I<0, Pseudo, (outs VR256:$dst), (ins), "",
                 [(set VR256:$dst, (v8f32 immAllZerosV))]>;
}

let Predicates = [HasAVX] in
  def : Pat<(v4f64 immAllZerosV), (AVX_SET0)>;

// 256-bit integer zero vectors use AVX_SET0 only when AVX2 is available.
let Predicates = [HasAVX2] in {
  def : Pat<(v4i64 immAllZerosV), (AVX_SET0)>;
  def : Pat<(v8i32 immAllZerosV), (AVX_SET0)>;
  def : Pat<(v16i16 immAllZerosV), (AVX_SET0)>;
  def : Pat<(v32i8 immAllZerosV), (AVX_SET0)>;
}
449
// AVX1 has no support for 256-bit integer instructions, but since the 128-bit
// VPXOR instruction writes zero to its upper part, it's safe to build zeros.
let Predicates = [HasAVX1Only] in {
def : Pat<(v32i8 immAllZerosV), (SUBREG_TO_REG (i8 0), (V_SET0), sub_xmm)>;
def : Pat<(bc_v32i8 (v8f32 immAllZerosV)),
          (SUBREG_TO_REG (i8 0), (V_SET0), sub_xmm)>;

def : Pat<(v16i16 immAllZerosV), (SUBREG_TO_REG (i16 0), (V_SET0), sub_xmm)>;
def : Pat<(bc_v16i16 (v8f32 immAllZerosV)),
          (SUBREG_TO_REG (i16 0), (V_SET0), sub_xmm)>;

def : Pat<(v8i32 immAllZerosV), (SUBREG_TO_REG (i32 0), (V_SET0), sub_xmm)>;
def : Pat<(bc_v8i32 (v8f32 immAllZerosV)),
          (SUBREG_TO_REG (i32 0), (V_SET0), sub_xmm)>;

def : Pat<(v4i64 immAllZerosV), (SUBREG_TO_REG (i64 0), (V_SET0), sub_xmm)>;
def : Pat<(bc_v4i64 (v8f32 immAllZerosV)),
          (SUBREG_TO_REG (i64 0), (V_SET0), sub_xmm)>;
}
469
// We set canFoldAsLoad because this can be converted to a constant-pool
// load of an all-ones value if folding it would be beneficial.
let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1,
    isPseudo = 1, SchedRW = [WriteZero] in {
  def V_SETALLONES : I<0, Pseudo, (outs VR128:$dst), (ins), "",
                       [(set VR128:$dst, (v4i32 immAllOnesV))]>;
  // 256-bit all-ones requires AVX2 (vpcmpeqd ymm).
  let Predicates = [HasAVX2] in
  def AVX2_SETALLONES : I<0, Pseudo, (outs VR256:$dst), (ins), "",
                          [(set VR256:$dst, (v8i32 immAllOnesV))]>;
}
480
481
482//===----------------------------------------------------------------------===//
483// SSE 1 & 2 - Move FP Scalar Instructions
484//
485// Move Instructions. Register-to-register movss/movsd is not used for FR32/64
486// register copies because it's a partial register update; Register-to-register
487// movss/movsd is not modeled as an INSERT_SUBREG because INSERT_SUBREG requires
488// that the insert be implementable in terms of a copy, and just mentioned, we
489// don't use movss/movsd for copies.
490//===----------------------------------------------------------------------===//
491
// Register-to-register movss/movsd forms (plus the store-form encoding used
// only by the disassembler).
// NOTE(review): the x86memop parameter is unused in this multiclass — kept
// for signature compatibility with callers; confirm before removing.
multiclass sse12_move_rr<RegisterClass RC, SDNode OpNode, ValueType vt,
                         X86MemOperand x86memop, string base_opc,
                         string asm_opr> {
  def rr : SI<0x10, MRMSrcReg, (outs VR128:$dst),
              (ins VR128:$src1, RC:$src2),
              !strconcat(base_opc, asm_opr),
              [(set VR128:$dst, (vt (OpNode VR128:$src1,
                                 (scalar_to_vector RC:$src2))))],
              IIC_SSE_MOV_S_RR>, Sched<[WriteMove]>;

  // For the disassembler
  let isCodeGenOnly = 1, hasSideEffects = 0 in
  def rr_REV : SI<0x11, MRMDestReg, (outs VR128:$dst),
                  (ins VR128:$src1, RC:$src2),
                  !strconcat(base_opc, asm_opr),
                  [], IIC_SSE_MOV_S_RR>, Sched<[WriteMove]>;
}
509
// Instantiates the AVX (3-operand, VEX-encoded) and legacy SSE (2-operand,
// tied) register forms plus the reg-to-mem store form for movss/movsd.
multiclass sse12_move<RegisterClass RC, SDNode OpNode, ValueType vt,
                      X86MemOperand x86memop, string OpcodeStr> {
  // AVX
  defm V#NAME : sse12_move_rr<RC, OpNode, vt, x86memop, OpcodeStr,
                              "\t{$src2, $src1, $dst|$dst, $src1, $src2}">,
                              VEX_4V, VEX_LIG;

  def V#NAME#mr : SI<0x11, MRMDestMem, (outs), (ins x86memop:$dst, RC:$src),
                     !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                     [(store RC:$src, addr:$dst)], IIC_SSE_MOV_S_MR>,
                     VEX, VEX_LIG, Sched<[WriteStore]>;
  // SSE1 & 2
  let Constraints = "$src1 = $dst" in {
    defm NAME : sse12_move_rr<RC, OpNode, vt, x86memop, OpcodeStr,
                              "\t{$src2, $dst|$dst, $src2}">;
  }

  def NAME#mr   : SI<0x11, MRMDestMem, (outs), (ins x86memop:$dst, RC:$src),
                     !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                     [(store RC:$src, addr:$dst)], IIC_SSE_MOV_S_MR>,
                  Sched<[WriteStore]>;
}
532
// Loading from memory automatically zeroing upper bits.
// Emits both the VEX (V-prefixed) and legacy SSE load forms.
multiclass sse12_move_rm<RegisterClass RC, X86MemOperand x86memop,
                         PatFrag mem_pat, string OpcodeStr> {
  def V#NAME#rm : SI<0x10, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src),
                     !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                     [(set RC:$dst, (mem_pat addr:$src))],
                     IIC_SSE_MOV_S_RM>, VEX, VEX_LIG, Sched<[WriteLoad]>;
  def NAME#rm   : SI<0x10, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src),
                     !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                     [(set RC:$dst, (mem_pat addr:$src))],
                     IIC_SSE_MOV_S_RM>, Sched<[WriteLoad]>;
}
545
// Instantiate movss/movsd register and store forms.
defm MOVSS : sse12_move<FR32, X86Movss, v4f32, f32mem, "movss">, XS;
defm MOVSD : sse12_move<FR64, X86Movsd, v2f64, f64mem, "movsd">, XD;

// Load forms; rematerializable and foldable as loads.
let canFoldAsLoad = 1, isReMaterializable = 1 in {
  defm MOVSS : sse12_move_rm<FR32, f32mem, loadf32, "movss">, XS;

  let AddedComplexity = 20 in
    defm MOVSD : sse12_move_rm<FR64, f64mem, loadf64, "movsd">, XD;
}
555
// Patterns
let Predicates = [UseAVX] in {
  let AddedComplexity = 15 in {
  // Move scalar to XMM zero-extended, zeroing a VR128 then do a
  // MOVS{S,D} to the lower bits.
  def : Pat<(v4f32 (X86vzmovl (v4f32 (scalar_to_vector FR32:$src)))),
            (VMOVSSrr (v4f32 (V_SET0)), FR32:$src)>;
  def : Pat<(v4f32 (X86vzmovl (v4f32 VR128:$src))),
            (VMOVSSrr (v4f32 (V_SET0)), (COPY_TO_REGCLASS VR128:$src, FR32))>;
  def : Pat<(v4i32 (X86vzmovl (v4i32 VR128:$src))),
            (VMOVSSrr (v4i32 (V_SET0)), (COPY_TO_REGCLASS VR128:$src, FR32))>;
  def : Pat<(v2f64 (X86vzmovl (v2f64 (scalar_to_vector FR64:$src)))),
            (VMOVSDrr (v2f64 (V_SET0)), FR64:$src)>;

  // Move low f32 and clear high bits.
  def : Pat<(v8f32 (X86vzmovl (v8f32 VR256:$src))),
            (SUBREG_TO_REG (i32 0),
             (VMOVSSrr (v4f32 (V_SET0)),
                       (EXTRACT_SUBREG (v8f32 VR256:$src), sub_xmm)), sub_xmm)>;
  def : Pat<(v8i32 (X86vzmovl (v8i32 VR256:$src))),
            (SUBREG_TO_REG (i32 0),
             (VMOVSSrr (v4i32 (V_SET0)),
                       (EXTRACT_SUBREG (v8i32 VR256:$src), sub_xmm)), sub_xmm)>;
  }

  let AddedComplexity = 20 in {
  // MOVSSrm zeros the high parts of the register; represent this
  // with SUBREG_TO_REG. The AVX versions also write: DST[255:128] <- 0
  def : Pat<(v4f32 (X86vzmovl (v4f32 (scalar_to_vector (loadf32 addr:$src))))),
            (COPY_TO_REGCLASS (VMOVSSrm addr:$src), VR128)>;
  def : Pat<(v4f32 (scalar_to_vector (loadf32 addr:$src))),
            (COPY_TO_REGCLASS (VMOVSSrm addr:$src), VR128)>;
  def : Pat<(v4f32 (X86vzmovl (loadv4f32 addr:$src))),
            (COPY_TO_REGCLASS (VMOVSSrm addr:$src), VR128)>;

  // MOVSDrm zeros the high parts of the register; represent this
  // with SUBREG_TO_REG. The AVX versions also write: DST[255:128] <- 0
  def : Pat<(v2f64 (X86vzmovl (v2f64 (scalar_to_vector (loadf64 addr:$src))))),
            (COPY_TO_REGCLASS (VMOVSDrm addr:$src), VR128)>;
  def : Pat<(v2f64 (scalar_to_vector (loadf64 addr:$src))),
            (COPY_TO_REGCLASS (VMOVSDrm addr:$src), VR128)>;
  def : Pat<(v2f64 (X86vzmovl (loadv2f64 addr:$src))),
            (COPY_TO_REGCLASS (VMOVSDrm addr:$src), VR128)>;
  def : Pat<(v2f64 (X86vzmovl (bc_v2f64 (loadv4f32 addr:$src)))),
            (COPY_TO_REGCLASS (VMOVSDrm addr:$src), VR128)>;
  def : Pat<(v2f64 (X86vzload addr:$src)),
            (COPY_TO_REGCLASS (VMOVSDrm addr:$src), VR128)>;

  // Represent the same patterns above but in the form they appear for
  // 256-bit types
  def : Pat<(v8i32 (X86vzmovl (insert_subvector undef,
                   (v4i32 (scalar_to_vector (loadi32 addr:$src))), (iPTR 0)))),
            (SUBREG_TO_REG (i32 0), (VMOVSSrm addr:$src), sub_xmm)>;
  def : Pat<(v8f32 (X86vzmovl (insert_subvector undef,
                   (v4f32 (scalar_to_vector (loadf32 addr:$src))), (iPTR 0)))),
            (SUBREG_TO_REG (i32 0), (VMOVSSrm addr:$src), sub_xmm)>;
  def : Pat<(v4f64 (X86vzmovl (insert_subvector undef,
                   (v2f64 (scalar_to_vector (loadf64 addr:$src))), (iPTR 0)))),
            (SUBREG_TO_REG (i32 0), (VMOVSDrm addr:$src), sub_xmm)>;
  }
  def : Pat<(v8f32 (X86vzmovl (insert_subvector undef,
                   (v4f32 (scalar_to_vector FR32:$src)), (iPTR 0)))),
            (SUBREG_TO_REG (i32 0),
                           (v4f32 (VMOVSSrr (v4f32 (V_SET0)), FR32:$src)),
                           sub_xmm)>;
  def : Pat<(v4f64 (X86vzmovl (insert_subvector undef,
                   (v2f64 (scalar_to_vector FR64:$src)), (iPTR 0)))),
            (SUBREG_TO_REG (i64 0),
                           (v2f64 (VMOVSDrr (v2f64 (V_SET0)), FR64:$src)),
                           sub_xmm)>;
  def : Pat<(v4i64 (X86vzmovl (insert_subvector undef,
                   (v2i64 (scalar_to_vector (loadi64 addr:$src))), (iPTR 0)))),
            (SUBREG_TO_REG (i64 0), (VMOVSDrm addr:$src), sub_xmm)>;

  // Move low f64 and clear high bits.
  def : Pat<(v4f64 (X86vzmovl (v4f64 VR256:$src))),
            (SUBREG_TO_REG (i32 0),
             (VMOVSDrr (v2f64 (V_SET0)),
                       (EXTRACT_SUBREG (v4f64 VR256:$src), sub_xmm)), sub_xmm)>;

  def : Pat<(v4i64 (X86vzmovl (v4i64 VR256:$src))),
            (SUBREG_TO_REG (i32 0),
             (VMOVSDrr (v2i64 (V_SET0)),
                       (EXTRACT_SUBREG (v4i64 VR256:$src), sub_xmm)), sub_xmm)>;

  // Extract and store.
  def : Pat<(store (f32 (vector_extract (v4f32 VR128:$src), (iPTR 0))),
                   addr:$dst),
            (VMOVSSmr addr:$dst, (COPY_TO_REGCLASS (v4f32 VR128:$src), FR32))>;
  def : Pat<(store (f64 (vector_extract (v2f64 VR128:$src), (iPTR 0))),
                   addr:$dst),
            (VMOVSDmr addr:$dst, (COPY_TO_REGCLASS (v2f64 VR128:$src), FR64))>;

  // Shuffle with VMOVSS
  def : Pat<(v4i32 (X86Movss VR128:$src1, VR128:$src2)),
            (VMOVSSrr (v4i32 VR128:$src1),
                      (COPY_TO_REGCLASS (v4i32 VR128:$src2), FR32))>;
  def : Pat<(v4f32 (X86Movss VR128:$src1, VR128:$src2)),
            (VMOVSSrr (v4f32 VR128:$src1),
                      (COPY_TO_REGCLASS (v4f32 VR128:$src2), FR32))>;

  // 256-bit variants
  def : Pat<(v8i32 (X86Movss VR256:$src1, VR256:$src2)),
            (SUBREG_TO_REG (i32 0),
              (VMOVSSrr (EXTRACT_SUBREG (v8i32 VR256:$src1), sub_xmm),
                        (EXTRACT_SUBREG (v8i32 VR256:$src2), sub_xmm)),
              sub_xmm)>;
  def : Pat<(v8f32 (X86Movss VR256:$src1, VR256:$src2)),
            (SUBREG_TO_REG (i32 0),
              (VMOVSSrr (EXTRACT_SUBREG (v8f32 VR256:$src1), sub_xmm),
                        (EXTRACT_SUBREG (v8f32 VR256:$src2), sub_xmm)),
              sub_xmm)>;

  // Shuffle with VMOVSD
  def : Pat<(v2i64 (X86Movsd VR128:$src1, VR128:$src2)),
            (VMOVSDrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR64))>;
  def : Pat<(v2f64 (X86Movsd VR128:$src1, VR128:$src2)),
            (VMOVSDrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR64))>;
  def : Pat<(v4f32 (X86Movsd VR128:$src1, VR128:$src2)),
            (VMOVSDrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR64))>;
  def : Pat<(v4i32 (X86Movsd VR128:$src1, VR128:$src2)),
            (VMOVSDrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR64))>;

  // 256-bit variants
  def : Pat<(v4i64 (X86Movsd VR256:$src1, VR256:$src2)),
            (SUBREG_TO_REG (i32 0),
              (VMOVSDrr (EXTRACT_SUBREG (v4i64 VR256:$src1), sub_xmm),
                        (EXTRACT_SUBREG (v4i64 VR256:$src2), sub_xmm)),
              sub_xmm)>;
  def : Pat<(v4f64 (X86Movsd VR256:$src1, VR256:$src2)),
            (SUBREG_TO_REG (i32 0),
              (VMOVSDrr (EXTRACT_SUBREG (v4f64 VR256:$src1), sub_xmm),
                        (EXTRACT_SUBREG (v4f64 VR256:$src2), sub_xmm)),
              sub_xmm)>;


  // FIXME: Instead of a X86Movlps there should be a X86Movsd here, the problem
  // is during lowering, where it's not possible to recognize the fold cause
  // it has two uses through a bitcast. One use disappears at isel time and the
  // fold opportunity reappears.
  def : Pat<(v2f64 (X86Movlpd VR128:$src1, VR128:$src2)),
            (VMOVSDrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR64))>;
  def : Pat<(v2i64 (X86Movlpd VR128:$src1, VR128:$src2)),
            (VMOVSDrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR64))>;
  def : Pat<(v4f32 (X86Movlps VR128:$src1, VR128:$src2)),
            (VMOVSDrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR64))>;
  def : Pat<(v4i32 (X86Movlps VR128:$src1, VR128:$src2)),
            (VMOVSDrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR64))>;
}
705
// SSE1-only (no AVX) selection patterns built on MOVSS: zero-extending a
// scalar into the low lane of an XMM register, zero-extending loads,
// extract-low-element stores, and MOVSS-style shuffles.
let Predicates = [UseSSE1] in {
  // Higher complexity than the plain shuffle patterns below so the
  // zeroing forms win when both could match.
  let AddedComplexity = 15 in {
  // Move scalar to XMM zero-extended, zeroing a VR128 then do a
  // MOVSS to the lower bits.
  def : Pat<(v4f32 (X86vzmovl (v4f32 (scalar_to_vector FR32:$src)))),
            (MOVSSrr (v4f32 (V_SET0)), FR32:$src)>;
  def : Pat<(v4f32 (X86vzmovl (v4f32 VR128:$src))),
            (MOVSSrr (v4f32 (V_SET0)), (COPY_TO_REGCLASS VR128:$src, FR32))>;
  def : Pat<(v4i32 (X86vzmovl (v4i32 VR128:$src))),
            (MOVSSrr (v4i32 (V_SET0)), (COPY_TO_REGCLASS VR128:$src, FR32))>;
  }

  let AddedComplexity = 20 in {
  // MOVSSrm already zeros the high parts of the register.
  def : Pat<(v4f32 (X86vzmovl (v4f32 (scalar_to_vector (loadf32 addr:$src))))),
            (COPY_TO_REGCLASS (MOVSSrm addr:$src), VR128)>;
  def : Pat<(v4f32 (scalar_to_vector (loadf32 addr:$src))),
            (COPY_TO_REGCLASS (MOVSSrm addr:$src), VR128)>;
  def : Pat<(v4f32 (X86vzmovl (loadv4f32 addr:$src))),
            (COPY_TO_REGCLASS (MOVSSrm addr:$src), VR128)>;
  }

  // Extract and store: store element 0 of a v4f32 with a scalar MOVSS store.
  def : Pat<(store (f32 (vector_extract (v4f32 VR128:$src), (iPTR 0))),
                   addr:$dst),
            (MOVSSmr addr:$dst, (COPY_TO_REGCLASS VR128:$src, FR32))>;

  // Shuffle with MOVSS (low element of $src2 into low lane of $src1).
  def : Pat<(v4i32 (X86Movss VR128:$src1, VR128:$src2)),
            (MOVSSrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR32))>;
  def : Pat<(v4f32 (X86Movss VR128:$src1, VR128:$src2)),
            (MOVSSrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR32))>;
}
739
// SSE2-only (no AVX) selection patterns built on MOVSD: the double-precision
// counterpart of the UseSSE1 MOVSS pattern block above.
let Predicates = [UseSSE2] in {
  let AddedComplexity = 15 in {
  // Move scalar to XMM zero-extended, zeroing a VR128 then do a
  // MOVSD to the lower bits.
  def : Pat<(v2f64 (X86vzmovl (v2f64 (scalar_to_vector FR64:$src)))),
            (MOVSDrr (v2f64 (V_SET0)), FR64:$src)>;
  }

  let AddedComplexity = 20 in {
  // MOVSDrm already zeros the high parts of the register.
  def : Pat<(v2f64 (X86vzmovl (v2f64 (scalar_to_vector (loadf64 addr:$src))))),
            (COPY_TO_REGCLASS (MOVSDrm addr:$src), VR128)>;
  def : Pat<(v2f64 (scalar_to_vector (loadf64 addr:$src))),
            (COPY_TO_REGCLASS (MOVSDrm addr:$src), VR128)>;
  def : Pat<(v2f64 (X86vzmovl (loadv2f64 addr:$src))),
            (COPY_TO_REGCLASS (MOVSDrm addr:$src), VR128)>;
  def : Pat<(v2f64 (X86vzmovl (bc_v2f64 (loadv4f32 addr:$src)))),
            (COPY_TO_REGCLASS (MOVSDrm addr:$src), VR128)>;
  def : Pat<(v2f64 (X86vzload addr:$src)),
            (COPY_TO_REGCLASS (MOVSDrm addr:$src), VR128)>;
  }

  // Extract and store: store element 0 of a v2f64 with a scalar MOVSD store.
  def : Pat<(store (f64 (vector_extract (v2f64 VR128:$src), (iPTR 0))),
                   addr:$dst),
            (MOVSDmr addr:$dst, (COPY_TO_REGCLASS VR128:$src, FR64))>;

  // Shuffle with MOVSD (low element of $src2 into low lane of $src1).
  def : Pat<(v2i64 (X86Movsd VR128:$src1, VR128:$src2)),
            (MOVSDrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR64))>;
  def : Pat<(v2f64 (X86Movsd VR128:$src1, VR128:$src2)),
            (MOVSDrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR64))>;
  def : Pat<(v4f32 (X86Movsd VR128:$src1, VR128:$src2)),
            (MOVSDrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR64))>;
  def : Pat<(v4i32 (X86Movsd VR128:$src1, VR128:$src2)),
            (MOVSDrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR64))>;

  // FIXME: Instead of a X86Movlps there should be a X86Movsd here, the problem
  // is during lowering, where it's not possible to recognize the fold because
  // it has two uses through a bitcast. One use disappears at isel time and the
  // fold opportunity reappears.
  def : Pat<(v2f64 (X86Movlpd VR128:$src1, VR128:$src2)),
            (MOVSDrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR64))>;
  def : Pat<(v2i64 (X86Movlpd VR128:$src1, VR128:$src2)),
            (MOVSDrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR64))>;
  def : Pat<(v4f32 (X86Movlps VR128:$src1, VR128:$src2)),
            (MOVSDrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR64))>;
  def : Pat<(v4i32 (X86Movlps VR128:$src1, VR128:$src2)),
            (MOVSDrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR64))>;
}
790
791//===----------------------------------------------------------------------===//
792// SSE 1 & 2 - Move Aligned/Unaligned FP Instructions
793//===----------------------------------------------------------------------===//
794
// Multiclass generating the two forms of a packed-FP full-register move:
//   NAMErr - register-to-register move (no pattern, no side effects)
//   NAMErm - load via ld_frag (aligned or unaligned depending on caller)
// 'd' selects the execution domain (SSEPackedSingle/SSEPackedDouble).
// Callers pass IsReMaterializable = 0 for variants whose load must not be
// re-materialized (see the movupd/movupd-Y instantiations below).
multiclass sse12_mov_packed<bits<8> opc, RegisterClass RC,
                            X86MemOperand x86memop, PatFrag ld_frag,
                            string asm, Domain d,
                            OpndItins itins,
                            bit IsReMaterializable = 1> {
// NOTE(review): 'neverHasSideEffects' is the legacy spelling of this flag in
// this LLVM revision; kept as-is.
let neverHasSideEffects = 1 in
  def rr : PI<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src),
              !strconcat(asm, "\t{$src, $dst|$dst, $src}"), [], itins.rr, d>,
           Sched<[WriteMove]>;
let canFoldAsLoad = 1, isReMaterializable = IsReMaterializable in
  def rm : PI<opc, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src),
              !strconcat(asm, "\t{$src, $dst|$dst, $src}"),
                   [(set RC:$dst, (ld_frag addr:$src))], itins.rm, d>,
           Sched<[WriteLoad]>;
}
810
// Instantiations of sse12_mov_packed:
//   - VEX-encoded 128-bit (VMOV*), VEX 256-bit (VMOV*Y), and legacy SSE (MOV*)
//   - aligned (movaps/movapd with alignedload*) vs unaligned (movups/movupd)
//   - the trailing '0' on the movupd variants clears isReMaterializable.
defm VMOVAPS : sse12_mov_packed<0x28, VR128, f128mem, alignedloadv4f32,
                              "movaps", SSEPackedSingle, SSE_MOVA_ITINS>,
                              TB, VEX;
defm VMOVAPD : sse12_mov_packed<0x28, VR128, f128mem, alignedloadv2f64,
                              "movapd", SSEPackedDouble, SSE_MOVA_ITINS>,
                              TB, OpSize, VEX;
defm VMOVUPS : sse12_mov_packed<0x10, VR128, f128mem, loadv4f32,
                              "movups", SSEPackedSingle, SSE_MOVU_ITINS>,
                              TB, VEX;
defm VMOVUPD : sse12_mov_packed<0x10, VR128, f128mem, loadv2f64,
                              "movupd", SSEPackedDouble, SSE_MOVU_ITINS, 0>,
                              TB, OpSize, VEX;

// 256-bit (YMM) VEX_L variants.
defm VMOVAPSY : sse12_mov_packed<0x28, VR256, f256mem, alignedloadv8f32,
                              "movaps", SSEPackedSingle, SSE_MOVA_ITINS>,
                              TB, VEX, VEX_L;
defm VMOVAPDY : sse12_mov_packed<0x28, VR256, f256mem, alignedloadv4f64,
                              "movapd", SSEPackedDouble, SSE_MOVA_ITINS>,
                              TB, OpSize, VEX, VEX_L;
defm VMOVUPSY : sse12_mov_packed<0x10, VR256, f256mem, loadv8f32,
                              "movups", SSEPackedSingle, SSE_MOVU_ITINS>,
                              TB, VEX, VEX_L;
defm VMOVUPDY : sse12_mov_packed<0x10, VR256, f256mem, loadv4f64,
                              "movupd", SSEPackedDouble, SSE_MOVU_ITINS, 0>,
                              TB, OpSize, VEX, VEX_L;
// Legacy (non-VEX) SSE encodings.
defm MOVAPS : sse12_mov_packed<0x28, VR128, f128mem, alignedloadv4f32,
                              "movaps", SSEPackedSingle, SSE_MOVA_ITINS>,
                              TB;
defm MOVAPD : sse12_mov_packed<0x28, VR128, f128mem, alignedloadv2f64,
                              "movapd", SSEPackedDouble, SSE_MOVA_ITINS>,
                              TB, OpSize;
defm MOVUPS : sse12_mov_packed<0x10, VR128, f128mem, loadv4f32,
                              "movups", SSEPackedSingle, SSE_MOVU_ITINS>,
                              TB;
defm MOVUPD : sse12_mov_packed<0x10, VR128, f128mem, loadv2f64,
                              "movupd", SSEPackedDouble, SSE_MOVU_ITINS, 0>,
                              TB, OpSize;
848
// VEX-encoded packed-FP stores (opcode 0x29 aligned, 0x11 unaligned),
// 128-bit and 256-bit (VEX_L) forms.
let SchedRW = [WriteStore] in {
def VMOVAPSmr : VPSI<0x29, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
                   "movaps\t{$src, $dst|$dst, $src}",
                   [(alignedstore (v4f32 VR128:$src), addr:$dst)],
                   IIC_SSE_MOVA_P_MR>, VEX;
def VMOVAPDmr : VPDI<0x29, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
                   "movapd\t{$src, $dst|$dst, $src}",
                   [(alignedstore (v2f64 VR128:$src), addr:$dst)],
                   IIC_SSE_MOVA_P_MR>, VEX;
def VMOVUPSmr : VPSI<0x11, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
                   "movups\t{$src, $dst|$dst, $src}",
                   [(store (v4f32 VR128:$src), addr:$dst)],
                   IIC_SSE_MOVU_P_MR>, VEX;
def VMOVUPDmr : VPDI<0x11, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
                   "movupd\t{$src, $dst|$dst, $src}",
                   [(store (v2f64 VR128:$src), addr:$dst)],
                   IIC_SSE_MOVU_P_MR>, VEX;
def VMOVAPSYmr : VPSI<0x29, MRMDestMem, (outs), (ins f256mem:$dst, VR256:$src),
                   "movaps\t{$src, $dst|$dst, $src}",
                   [(alignedstore256 (v8f32 VR256:$src), addr:$dst)],
                   IIC_SSE_MOVA_P_MR>, VEX, VEX_L;
def VMOVAPDYmr : VPDI<0x29, MRMDestMem, (outs), (ins f256mem:$dst, VR256:$src),
                   "movapd\t{$src, $dst|$dst, $src}",
                   [(alignedstore256 (v4f64 VR256:$src), addr:$dst)],
                   IIC_SSE_MOVA_P_MR>, VEX, VEX_L;
def VMOVUPSYmr : VPSI<0x11, MRMDestMem, (outs), (ins f256mem:$dst, VR256:$src),
                   "movups\t{$src, $dst|$dst, $src}",
                   [(store (v8f32 VR256:$src), addr:$dst)],
                   IIC_SSE_MOVU_P_MR>, VEX, VEX_L;
def VMOVUPDYmr : VPDI<0x11, MRMDestMem, (outs), (ins f256mem:$dst, VR256:$src),
                   "movupd\t{$src, $dst|$dst, $src}",
                   [(store (v4f64 VR256:$src), addr:$dst)],
                   IIC_SSE_MOVU_P_MR>, VEX, VEX_L;
} // SchedRW
883
// For disassembler only: the reverse-operand register-to-register encodings
// (store-form opcodes 0x29/0x11 with a register destination). These are never
// selected by codegen (isCodeGenOnly, no patterns) but let the disassembler
// round-trip both encodings of the same move.
let isCodeGenOnly = 1, hasSideEffects = 0, SchedRW = [WriteMove] in {
  def VMOVAPSrr_REV : VPSI<0x29, MRMDestReg, (outs VR128:$dst),
                          (ins VR128:$src),
                          "movaps\t{$src, $dst|$dst, $src}", [],
                          IIC_SSE_MOVA_P_RR>, VEX;
  def VMOVAPDrr_REV : VPDI<0x29, MRMDestReg, (outs VR128:$dst),
                           (ins VR128:$src),
                           "movapd\t{$src, $dst|$dst, $src}", [],
                           IIC_SSE_MOVA_P_RR>, VEX;
  def VMOVUPSrr_REV : VPSI<0x11, MRMDestReg, (outs VR128:$dst),
                           (ins VR128:$src),
                           "movups\t{$src, $dst|$dst, $src}", [],
                           IIC_SSE_MOVU_P_RR>, VEX;
  def VMOVUPDrr_REV : VPDI<0x11, MRMDestReg, (outs VR128:$dst),
                           (ins VR128:$src),
                           "movupd\t{$src, $dst|$dst, $src}", [],
                           IIC_SSE_MOVU_P_RR>, VEX;
  def VMOVAPSYrr_REV : VPSI<0x29, MRMDestReg, (outs VR256:$dst),
                            (ins VR256:$src),
                            "movaps\t{$src, $dst|$dst, $src}", [],
                            IIC_SSE_MOVA_P_RR>, VEX, VEX_L;
  def VMOVAPDYrr_REV : VPDI<0x29, MRMDestReg, (outs VR256:$dst),
                            (ins VR256:$src),
                            "movapd\t{$src, $dst|$dst, $src}", [],
                            IIC_SSE_MOVA_P_RR>, VEX, VEX_L;
  def VMOVUPSYrr_REV : VPSI<0x11, MRMDestReg, (outs VR256:$dst),
                            (ins VR256:$src),
                            "movups\t{$src, $dst|$dst, $src}", [],
                            IIC_SSE_MOVU_P_RR>, VEX, VEX_L;
  def VMOVUPDYrr_REV : VPDI<0x11, MRMDestReg, (outs VR256:$dst),
                            (ins VR256:$src),
                            "movupd\t{$src, $dst|$dst, $src}", [],
                            IIC_SSE_MOVU_P_RR>, VEX, VEX_L;
}
919
// Zero-extending a 128-bit vector into the low half of a 256-bit register:
// VMOVAPSrr of the XMM source zeroes the upper YMM bits (VEX semantics), so a
// plain subreg insert suffices. All four element types go through VMOVAPSrr.
let Predicates = [HasAVX] in {
def : Pat<(v8i32 (X86vzmovl
                  (insert_subvector undef, (v4i32 VR128:$src), (iPTR 0)))),
          (SUBREG_TO_REG (i32 0), (VMOVAPSrr VR128:$src), sub_xmm)>;
def : Pat<(v4i64 (X86vzmovl
                  (insert_subvector undef, (v2i64 VR128:$src), (iPTR 0)))),
          (SUBREG_TO_REG (i32 0), (VMOVAPSrr VR128:$src), sub_xmm)>;
def : Pat<(v8f32 (X86vzmovl
                  (insert_subvector undef, (v4f32 VR128:$src), (iPTR 0)))),
          (SUBREG_TO_REG (i32 0), (VMOVAPSrr VR128:$src), sub_xmm)>;
def : Pat<(v4f64 (X86vzmovl
                  (insert_subvector undef, (v2f64 VR128:$src), (iPTR 0)))),
          (SUBREG_TO_REG (i32 0), (VMOVAPSrr VR128:$src), sub_xmm)>;
}
934
935
// Map the 256-bit unaligned-store intrinsics directly to the YMM store forms.
def : Pat<(int_x86_avx_storeu_ps_256 addr:$dst, VR256:$src),
          (VMOVUPSYmr addr:$dst, VR256:$src)>;
def : Pat<(int_x86_avx_storeu_pd_256 addr:$dst, VR256:$src),
          (VMOVUPDYmr addr:$dst, VR256:$src)>;
940
// Legacy (non-VEX) packed-FP stores, mirroring the VEX store defs above.
let SchedRW = [WriteStore] in {
def MOVAPSmr : PSI<0x29, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
                   "movaps\t{$src, $dst|$dst, $src}",
                   [(alignedstore (v4f32 VR128:$src), addr:$dst)],
                   IIC_SSE_MOVA_P_MR>;
def MOVAPDmr : PDI<0x29, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
                   "movapd\t{$src, $dst|$dst, $src}",
                   [(alignedstore (v2f64 VR128:$src), addr:$dst)],
                   IIC_SSE_MOVA_P_MR>;
def MOVUPSmr : PSI<0x11, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
                   "movups\t{$src, $dst|$dst, $src}",
                   [(store (v4f32 VR128:$src), addr:$dst)],
                   IIC_SSE_MOVU_P_MR>;
def MOVUPDmr : PDI<0x11, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
                   "movupd\t{$src, $dst|$dst, $src}",
                   [(store (v2f64 VR128:$src), addr:$dst)],
                   IIC_SSE_MOVU_P_MR>;
} // SchedRW
959
// For disassembler only: legacy-encoded reverse-operand register forms
// (opcode 0x29/0x11 with register destination); never selected by codegen.
let isCodeGenOnly = 1, hasSideEffects = 0, SchedRW = [WriteMove] in {
  def MOVAPSrr_REV : PSI<0x29, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
                         "movaps\t{$src, $dst|$dst, $src}", [],
                         IIC_SSE_MOVA_P_RR>;
  def MOVAPDrr_REV : PDI<0x29, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
                         "movapd\t{$src, $dst|$dst, $src}", [],
                         IIC_SSE_MOVA_P_RR>;
  def MOVUPSrr_REV : PSI<0x11, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
                         "movups\t{$src, $dst|$dst, $src}", [],
                         IIC_SSE_MOVU_P_RR>;
  def MOVUPDrr_REV : PDI<0x11, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
                         "movupd\t{$src, $dst|$dst, $src}", [],
                         IIC_SSE_MOVU_P_RR>;
}
975
// 128-bit unaligned-store intrinsics -> VEX store forms when AVX is available.
let Predicates = [HasAVX] in {
  def : Pat<(int_x86_sse_storeu_ps addr:$dst, VR128:$src),
            (VMOVUPSmr addr:$dst, VR128:$src)>;
  def : Pat<(int_x86_sse2_storeu_pd addr:$dst, VR128:$src),
            (VMOVUPDmr addr:$dst, VR128:$src)>;
}
982
// Same intrinsics mapped to the legacy encodings when AVX is not used.
let Predicates = [UseSSE1] in
  def : Pat<(int_x86_sse_storeu_ps addr:$dst, VR128:$src),
            (MOVUPSmr addr:$dst, VR128:$src)>;
let Predicates = [UseSSE2] in
  def : Pat<(int_x86_sse2_storeu_pd addr:$dst, VR128:$src),
            (MOVUPDmr addr:$dst, VR128:$src)>;
989
// Use vmovaps/vmovups for AVX integer load/store.
// Integer vector types are routed through the FP move encodings; the
// execution-domain fix pass may later flip them to the integer domain.
let Predicates = [HasAVX] in {
  // 128-bit load/store
  def : Pat<(alignedloadv2i64 addr:$src),
            (VMOVAPSrm addr:$src)>;
  def : Pat<(loadv2i64 addr:$src),
            (VMOVUPSrm addr:$src)>;

  def : Pat<(alignedstore (v2i64 VR128:$src), addr:$dst),
            (VMOVAPSmr addr:$dst, VR128:$src)>;
  def : Pat<(alignedstore (v4i32 VR128:$src), addr:$dst),
            (VMOVAPSmr addr:$dst, VR128:$src)>;
  def : Pat<(alignedstore (v8i16 VR128:$src), addr:$dst),
            (VMOVAPSmr addr:$dst, VR128:$src)>;
  def : Pat<(alignedstore (v16i8 VR128:$src), addr:$dst),
            (VMOVAPSmr addr:$dst, VR128:$src)>;
  def : Pat<(store (v2i64 VR128:$src), addr:$dst),
            (VMOVUPSmr addr:$dst, VR128:$src)>;
  def : Pat<(store (v4i32 VR128:$src), addr:$dst),
            (VMOVUPSmr addr:$dst, VR128:$src)>;
  def : Pat<(store (v8i16 VR128:$src), addr:$dst),
            (VMOVUPSmr addr:$dst, VR128:$src)>;
  def : Pat<(store (v16i8 VR128:$src), addr:$dst),
            (VMOVUPSmr addr:$dst, VR128:$src)>;

  // 256-bit load/store
  def : Pat<(alignedloadv4i64 addr:$src),
            (VMOVAPSYrm addr:$src)>;
  def : Pat<(loadv4i64 addr:$src),
            (VMOVUPSYrm addr:$src)>;
  def : Pat<(alignedstore256 (v4i64 VR256:$src), addr:$dst),
            (VMOVAPSYmr addr:$dst, VR256:$src)>;
  def : Pat<(alignedstore256 (v8i32 VR256:$src), addr:$dst),
            (VMOVAPSYmr addr:$dst, VR256:$src)>;
  def : Pat<(alignedstore256 (v16i16 VR256:$src), addr:$dst),
            (VMOVAPSYmr addr:$dst, VR256:$src)>;
  def : Pat<(alignedstore256 (v32i8 VR256:$src), addr:$dst),
            (VMOVAPSYmr addr:$dst, VR256:$src)>;
  def : Pat<(store (v4i64 VR256:$src), addr:$dst),
            (VMOVUPSYmr addr:$dst, VR256:$src)>;
  def : Pat<(store (v8i32 VR256:$src), addr:$dst),
            (VMOVUPSYmr addr:$dst, VR256:$src)>;
  def : Pat<(store (v16i16 VR256:$src), addr:$dst),
            (VMOVUPSYmr addr:$dst, VR256:$src)>;
  def : Pat<(store (v32i8 VR256:$src), addr:$dst),
            (VMOVUPSYmr addr:$dst, VR256:$src)>;

  // Special patterns for storing subvector extracts of lower 128-bits
  // Its cheaper to just use VMOVAPS/VMOVUPS instead of VEXTRACTF128mr
  def : Pat<(alignedstore (v2f64 (extract_subvector
                                  (v4f64 VR256:$src), (iPTR 0))), addr:$dst),
            (VMOVAPDmr addr:$dst, (v2f64 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>;
  def : Pat<(alignedstore (v4f32 (extract_subvector
                                  (v8f32 VR256:$src), (iPTR 0))), addr:$dst),
            (VMOVAPSmr addr:$dst, (v4f32 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>;
  def : Pat<(alignedstore (v2i64 (extract_subvector
                                  (v4i64 VR256:$src), (iPTR 0))), addr:$dst),
            (VMOVAPDmr addr:$dst, (v2i64 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>;
  def : Pat<(alignedstore (v4i32 (extract_subvector
                                  (v8i32 VR256:$src), (iPTR 0))), addr:$dst),
            (VMOVAPSmr addr:$dst, (v4i32 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>;
  def : Pat<(alignedstore (v8i16 (extract_subvector
                                  (v16i16 VR256:$src), (iPTR 0))), addr:$dst),
            (VMOVAPSmr addr:$dst, (v8i16 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>;
  def : Pat<(alignedstore (v16i8 (extract_subvector
                                  (v32i8 VR256:$src), (iPTR 0))), addr:$dst),
            (VMOVAPSmr addr:$dst, (v16i8 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>;

  def : Pat<(store (v2f64 (extract_subvector
                           (v4f64 VR256:$src), (iPTR 0))), addr:$dst),
            (VMOVUPDmr addr:$dst, (v2f64 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>;
  def : Pat<(store (v4f32 (extract_subvector
                           (v8f32 VR256:$src), (iPTR 0))), addr:$dst),
            (VMOVUPSmr addr:$dst, (v4f32 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>;
  def : Pat<(store (v2i64 (extract_subvector
                           (v4i64 VR256:$src), (iPTR 0))), addr:$dst),
            (VMOVUPDmr addr:$dst, (v2i64 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>;
  def : Pat<(store (v4i32 (extract_subvector
                           (v8i32 VR256:$src), (iPTR 0))), addr:$dst),
            (VMOVUPSmr addr:$dst, (v4i32 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>;
  def : Pat<(store (v8i16 (extract_subvector
                           (v16i16 VR256:$src), (iPTR 0))), addr:$dst),
            (VMOVUPSmr addr:$dst, (v8i16 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>;
  def : Pat<(store (v16i8 (extract_subvector
                           (v32i8 VR256:$src), (iPTR 0))), addr:$dst),
            (VMOVUPSmr addr:$dst, (v16i8 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>;
}
1077
// Use movaps / movups for SSE integer load / store (one byte shorter).
// The instructions selected below are then converted to MOVDQA/MOVDQU
// during the SSE domain pass.
let Predicates = [UseSSE1] in {
  def : Pat<(alignedloadv2i64 addr:$src),
            (MOVAPSrm addr:$src)>;
  def : Pat<(loadv2i64 addr:$src),
            (MOVUPSrm addr:$src)>;

  // Aligned integer stores of every 128-bit element type go through MOVAPS.
  def : Pat<(alignedstore (v2i64 VR128:$src), addr:$dst),
            (MOVAPSmr addr:$dst, VR128:$src)>;
  def : Pat<(alignedstore (v4i32 VR128:$src), addr:$dst),
            (MOVAPSmr addr:$dst, VR128:$src)>;
  def : Pat<(alignedstore (v8i16 VR128:$src), addr:$dst),
            (MOVAPSmr addr:$dst, VR128:$src)>;
  def : Pat<(alignedstore (v16i8 VR128:$src), addr:$dst),
            (MOVAPSmr addr:$dst, VR128:$src)>;
  // Unaligned stores use MOVUPS.
  def : Pat<(store (v2i64 VR128:$src), addr:$dst),
            (MOVUPSmr addr:$dst, VR128:$src)>;
  def : Pat<(store (v4i32 VR128:$src), addr:$dst),
            (MOVUPSmr addr:$dst, VR128:$src)>;
  def : Pat<(store (v8i16 VR128:$src), addr:$dst),
            (MOVUPSmr addr:$dst, VR128:$src)>;
  def : Pat<(store (v16i8 VR128:$src), addr:$dst),
            (MOVUPSmr addr:$dst, VR128:$src)>;
}
1104
// Alias instruction to load FR32 or FR64 from f128mem using movaps. Upper
// bits are disregarded. FIXME: Set encoding to pseudo!
// (codegen-only: lets a scalar FP load use the packed aligned-load encoding)
let canFoldAsLoad = 1, isReMaterializable = 1, SchedRW = [WriteLoad] in {
let isCodeGenOnly = 1 in {
  def FsVMOVAPSrm : VPSI<0x28, MRMSrcMem, (outs FR32:$dst), (ins f128mem:$src),
                         "movaps\t{$src, $dst|$dst, $src}",
                         [(set FR32:$dst, (alignedloadfsf32 addr:$src))],
                         IIC_SSE_MOVA_P_RM>, VEX;
  def FsVMOVAPDrm : VPDI<0x28, MRMSrcMem, (outs FR64:$dst), (ins f128mem:$src),
                         "movapd\t{$src, $dst|$dst, $src}",
                         [(set FR64:$dst, (alignedloadfsf64 addr:$src))],
                         IIC_SSE_MOVA_P_RM>, VEX;
  def FsMOVAPSrm : PSI<0x28, MRMSrcMem, (outs FR32:$dst), (ins f128mem:$src),
                       "movaps\t{$src, $dst|$dst, $src}",
                       [(set FR32:$dst, (alignedloadfsf32 addr:$src))],
                       IIC_SSE_MOVA_P_RM>;
  def FsMOVAPDrm : PDI<0x28, MRMSrcMem, (outs FR64:$dst), (ins f128mem:$src),
                       "movapd\t{$src, $dst|$dst, $src}",
                       [(set FR64:$dst, (alignedloadfsf64 addr:$src))],
                       IIC_SSE_MOVA_P_RM>;
}
}
1127
1128//===----------------------------------------------------------------------===//
1129// SSE 1 & 2 - Move Low packed FP Instructions
1130//===----------------------------------------------------------------------===//
1131
// Base multiclass for MOVLPS/MOVLPD (and MOVHPS/MOVHPD) load forms: merges a
// 64-bit memory operand into one half of an XMM register. psnode/pdnode are
// the single/double-precision DAG nodes; asm_opr carries the operand string
// (two-operand legacy vs three-operand VEX, supplied by the wrapper below).
multiclass sse12_mov_hilo_packed_base<bits<8>opc, SDNode psnode, SDNode pdnode,
                                      string base_opc, string asm_opr,
                                      InstrItinClass itin> {
  // Single-precision form: the f64 load is bitcast to v4f32 for psnode.
  def PSrm : PI<opc, MRMSrcMem,
         (outs VR128:$dst), (ins VR128:$src1, f64mem:$src2),
         !strconcat(base_opc, "s", asm_opr),
     [(set VR128:$dst,
       (psnode VR128:$src1,
              (bc_v4f32 (v2f64 (scalar_to_vector (loadf64 addr:$src2))))))],
              itin, SSEPackedSingle>, TB,
     Sched<[WriteShuffleLd, ReadAfterLd]>;

  // Double-precision form: the loaded f64 feeds pdnode directly.
  def PDrm : PI<opc, MRMSrcMem,
         (outs VR128:$dst), (ins VR128:$src1, f64mem:$src2),
         !strconcat(base_opc, "d", asm_opr),
     [(set VR128:$dst, (v2f64 (pdnode VR128:$src1,
                              (scalar_to_vector (loadf64 addr:$src2)))))],
              itin, SSEPackedDouble>, TB, OpSize,
     Sched<[WriteShuffleLd, ReadAfterLd]>;

}
1153
// Wrapper that instantiates both encodings of the hi/lo packed moves:
// a VEX_4V three-operand form (prefix "V") and a legacy two-operand,
// read-modify-write form tied via "$src1 = $dst".
multiclass sse12_mov_hilo_packed<bits<8>opc, SDNode psnode, SDNode pdnode,
                                 string base_opc, InstrItinClass itin> {
  defm V#NAME : sse12_mov_hilo_packed_base<opc, psnode, pdnode, base_opc,
                                    "\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                                    itin>, VEX_4V;

let Constraints = "$src1 = $dst" in
  defm NAME : sse12_mov_hilo_packed_base<opc, psnode, pdnode, base_opc,
                                    "\t{$src2, $dst|$dst, $src2}",
                                    itin>;
}
1165
// Instantiate MOVLPS/MOVLPD (opcode 0x12): merge memory into the low half.
let AddedComplexity = 20 in {
  defm MOVL : sse12_mov_hilo_packed<0x12, X86Movlps, X86Movlpd, "movlp",
                                    IIC_SSE_MOV_LH>;
}
1170
// MOVLPS/MOVLPD store forms (opcode 0x13): store the low 64 bits of an XMM
// register, modeled as an extract of element 0 of the (bitcast) v2f64 value.
let SchedRW = [WriteStore] in {
def VMOVLPSmr : VPSI<0x13, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
                   "movlps\t{$src, $dst|$dst, $src}",
                   [(store (f64 (vector_extract (bc_v2f64 (v4f32 VR128:$src)),
                                 (iPTR 0))), addr:$dst)],
                                 IIC_SSE_MOV_LH>, VEX;
def VMOVLPDmr : VPDI<0x13, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
                   "movlpd\t{$src, $dst|$dst, $src}",
                   [(store (f64 (vector_extract (v2f64 VR128:$src),
                                 (iPTR 0))), addr:$dst)],
                                 IIC_SSE_MOV_LH>, VEX;
def MOVLPSmr : PSI<0x13, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
                   "movlps\t{$src, $dst|$dst, $src}",
                   [(store (f64 (vector_extract (bc_v2f64 (v4f32 VR128:$src)),
                                 (iPTR 0))), addr:$dst)],
                                 IIC_SSE_MOV_LH>;
def MOVLPDmr : PDI<0x13, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
                   "movlpd\t{$src, $dst|$dst, $src}",
                   [(store (f64 (vector_extract (v2f64 VR128:$src),
                                 (iPTR 0))), addr:$dst)],
                                 IIC_SSE_MOV_LH>;
} // SchedRW
1193
// Extra AVX selection patterns folding loads/stores into VMOVLPS/VMOVLPD.
let Predicates = [HasAVX] in {
  // Shuffle with VMOVLPS
  def : Pat<(v4f32 (X86Movlps VR128:$src1, (load addr:$src2))),
            (VMOVLPSrm VR128:$src1, addr:$src2)>;
  def : Pat<(v4i32 (X86Movlps VR128:$src1, (load addr:$src2))),
            (VMOVLPSrm VR128:$src1, addr:$src2)>;

  // Shuffle with VMOVLPD
  def : Pat<(v2f64 (X86Movlpd VR128:$src1, (load addr:$src2))),
            (VMOVLPDrm VR128:$src1, addr:$src2)>;
  def : Pat<(v2i64 (X86Movlpd VR128:$src1, (load addr:$src2))),
            (VMOVLPDrm VR128:$src1, addr:$src2)>;

  // Store patterns: a shuffle whose result is stored back over its own
  // memory operand collapses to a plain low-half store.
  def : Pat<(store (v4f32 (X86Movlps (load addr:$src1), VR128:$src2)),
                   addr:$src1),
            (VMOVLPSmr addr:$src1, VR128:$src2)>;
  def : Pat<(store (v4i32 (X86Movlps
                   (bc_v4i32 (loadv2i64 addr:$src1)), VR128:$src2)), addr:$src1),
            (VMOVLPSmr addr:$src1, VR128:$src2)>;
  def : Pat<(store (v2f64 (X86Movlpd (load addr:$src1), VR128:$src2)),
                   addr:$src1),
            (VMOVLPDmr addr:$src1, VR128:$src2)>;
  def : Pat<(store (v2i64 (X86Movlpd (load addr:$src1), VR128:$src2)),
                   addr:$src1),
            (VMOVLPDmr addr:$src1, VR128:$src2)>;
}
1221
// SSE1-only MOVLPS folding patterns (legacy encodings).
let Predicates = [UseSSE1] in {
  // (store (vector_shuffle (load addr), v2, <4, 5, 2, 3>), addr) using MOVLPS
  def : Pat<(store (i64 (vector_extract (bc_v2i64 (v4f32 VR128:$src2)),
                                 (iPTR 0))), addr:$src1),
            (MOVLPSmr addr:$src1, VR128:$src2)>;

  // Shuffle with MOVLPS
  def : Pat<(v4f32 (X86Movlps VR128:$src1, (load addr:$src2))),
            (MOVLPSrm VR128:$src1, addr:$src2)>;
  def : Pat<(v4i32 (X86Movlps VR128:$src1, (load addr:$src2))),
            (MOVLPSrm VR128:$src1, addr:$src2)>;
  // Also match a scalar i64 load bitcast into the v4f32 operand position.
  def : Pat<(X86Movlps VR128:$src1,
                      (bc_v4f32 (v2i64 (scalar_to_vector (loadi64 addr:$src2))))),
            (MOVLPSrm VR128:$src1, addr:$src2)>;

  // Store patterns
  def : Pat<(store (v4f32 (X86Movlps (load addr:$src1), VR128:$src2)),
                                      addr:$src1),
            (MOVLPSmr addr:$src1, VR128:$src2)>;
  def : Pat<(store (v4i32 (X86Movlps
                   (bc_v4i32 (loadv2i64 addr:$src1)), VR128:$src2)),
                              addr:$src1),
            (MOVLPSmr addr:$src1, VR128:$src2)>;
}
1246
// SSE2-only MOVLPD folding patterns (legacy encodings).
let Predicates = [UseSSE2] in {
  // Shuffle with MOVLPD
  def : Pat<(v2f64 (X86Movlpd VR128:$src1, (load addr:$src2))),
            (MOVLPDrm VR128:$src1, addr:$src2)>;
  def : Pat<(v2i64 (X86Movlpd VR128:$src1, (load addr:$src2))),
            (MOVLPDrm VR128:$src1, addr:$src2)>;

  // Store patterns
  def : Pat<(store (v2f64 (X86Movlpd (load addr:$src1), VR128:$src2)),
                           addr:$src1),
            (MOVLPDmr addr:$src1, VR128:$src2)>;
  def : Pat<(store (v2i64 (X86Movlpd (load addr:$src1), VR128:$src2)),
                           addr:$src1),
            (MOVLPDmr addr:$src1, VR128:$src2)>;
}
1262
1263//===----------------------------------------------------------------------===//
1264// SSE 1 & 2 - Move Hi packed FP Instructions
1265//===----------------------------------------------------------------------===//
1266
// Instantiate MOVHPS/MOVHPD (opcode 0x16): merge memory into the high half.
let AddedComplexity = 20 in {
  defm MOVH : sse12_mov_hilo_packed<0x16, X86Movlhps, X86Movlhpd, "movhp",
                                    IIC_SSE_MOV_LH>;
}
1271
// MOVHPS/MOVHPD store forms (opcode 0x17): store the high 64 bits of an XMM
// register. The pattern models "unpack high into low, then extract elt 0".
let SchedRW = [WriteStore] in {
// v2f64 extract element 1 is always custom lowered to unpack high to low
// and extract element 0 so the non-store version isn't too horrible.
def VMOVHPSmr : VPSI<0x17, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
                   "movhps\t{$src, $dst|$dst, $src}",
                   [(store (f64 (vector_extract
                                 (X86Unpckh (bc_v2f64 (v4f32 VR128:$src)),
                                            (bc_v2f64 (v4f32 VR128:$src))),
                                 (iPTR 0))), addr:$dst)], IIC_SSE_MOV_LH>, VEX;
def VMOVHPDmr : VPDI<0x17, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
                   "movhpd\t{$src, $dst|$dst, $src}",
                   [(store (f64 (vector_extract
                                 (v2f64 (X86Unpckh VR128:$src, VR128:$src)),
                                 (iPTR 0))), addr:$dst)], IIC_SSE_MOV_LH>, VEX;
def MOVHPSmr : PSI<0x17, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
                   "movhps\t{$src, $dst|$dst, $src}",
                   [(store (f64 (vector_extract
                                 (X86Unpckh (bc_v2f64 (v4f32 VR128:$src)),
                                            (bc_v2f64 (v4f32 VR128:$src))),
                                 (iPTR 0))), addr:$dst)], IIC_SSE_MOV_LH>;
def MOVHPDmr : PDI<0x17, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
                   "movhpd\t{$src, $dst|$dst, $src}",
                   [(store (f64 (vector_extract
                                 (v2f64 (X86Unpckh VR128:$src, VR128:$src)),
                                 (iPTR 0))), addr:$dst)], IIC_SSE_MOV_LH>;
} // SchedRW
1298
// AVX selection patterns that fold loads into VMOVHPS/VMOVHPD.
let Predicates = [HasAVX] in {
  // VMOVHPS patterns
  def : Pat<(X86Movlhps VR128:$src1,
                 (bc_v4f32 (v2i64 (scalar_to_vector (loadi64 addr:$src2))))),
            (VMOVHPSrm VR128:$src1, addr:$src2)>;
  // NOTE(review): this pattern uses bc_v4i32 while the parallel UseSSE1
  // pattern below uses bc_v4f32 for the same X86vzload shape — looks
  // inconsistent; confirm which bitcast the lowering actually produces.
  def : Pat<(X86Movlhps VR128:$src1,
                 (bc_v4i32 (v2i64 (X86vzload addr:$src2)))),
            (VMOVHPSrm VR128:$src1, addr:$src2)>;

  // FIXME: Instead of X86Unpckl, there should be a X86Movlhpd here, the problem
  // is during lowering, where it's not possible to recognize the load fold
  // cause it has two uses through a bitcast. One use disappears at isel time
  // and the fold opportunity reappears.
  def : Pat<(v2f64 (X86Unpckl VR128:$src1,
                      (scalar_to_vector (loadf64 addr:$src2)))),
            (VMOVHPDrm VR128:$src1, addr:$src2)>;
}
1316
// SSE1-only versions of the load-folding patterns above, selecting the
// non-VEX MOVHPSrm.
let Predicates = [UseSSE1] in {
  // MOVHPS patterns
  def : Pat<(X86Movlhps VR128:$src1,
                 (bc_v4f32 (v2i64 (scalar_to_vector (loadi64 addr:$src2))))),
            (MOVHPSrm VR128:$src1, addr:$src2)>;
  def : Pat<(X86Movlhps VR128:$src1,
                 (bc_v4f32 (v2i64 (X86vzload addr:$src2)))),
            (MOVHPSrm VR128:$src1, addr:$src2)>;
}
1326
// SSE2-only version of the X86Unpckl load-fold pattern, selecting the
// non-VEX MOVHPDrm (see the FIXME above for why Unpckl is matched here).
let Predicates = [UseSSE2] in {
  // FIXME: Instead of X86Unpckl, there should be a X86Movlhpd here, the problem
  // is during lowering, where it's not possible to recognize the load fold
  // cause it has two uses through a bitcast. One use disappears at isel time
  // and the fold opportunity reappears.
  def : Pat<(v2f64 (X86Unpckl VR128:$src1,
                      (scalar_to_vector (loadf64 addr:$src2)))),
            (MOVHPDrm VR128:$src1, addr:$src2)>;
}
1336
1337//===----------------------------------------------------------------------===//
1338// SSE 1 & 2 - Move Low to High and High to Low packed FP Instructions
1339//===----------------------------------------------------------------------===//
1340
// VEX-encoded register-register MOVLHPS/MOVHLPS: merge the low/high
// halves of two XMM registers.  Three-operand (VEX_4V) forms.
let AddedComplexity = 20, Predicates = [UseAVX] in {
  def VMOVLHPSrr : VPSI<0x16, MRMSrcReg, (outs VR128:$dst),
                                       (ins VR128:$src1, VR128:$src2),
                      "movlhps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                      [(set VR128:$dst,
                        (v4f32 (X86Movlhps VR128:$src1, VR128:$src2)))],
                        IIC_SSE_MOV_LH>,
                      VEX_4V, Sched<[WriteShuffle]>;
  def VMOVHLPSrr : VPSI<0x12, MRMSrcReg, (outs VR128:$dst),
                                       (ins VR128:$src1, VR128:$src2),
                      "movhlps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                      [(set VR128:$dst,
                        (v4f32 (X86Movhlps VR128:$src1, VR128:$src2)))],
                        IIC_SSE_MOV_LH>,
                      VEX_4V, Sched<[WriteShuffle]>;
}
// Legacy SSE encodings of the same instructions; two-operand form, so
// $src1 is tied to $dst via the Constraints string.
let Constraints = "$src1 = $dst", AddedComplexity = 20 in {
  def MOVLHPSrr : PSI<0x16, MRMSrcReg, (outs VR128:$dst),
                                       (ins VR128:$src1, VR128:$src2),
                      "movlhps\t{$src2, $dst|$dst, $src2}",
                      [(set VR128:$dst,
                        (v4f32 (X86Movlhps VR128:$src1, VR128:$src2)))],
                        IIC_SSE_MOV_LH>, Sched<[WriteShuffle]>;
  def MOVHLPSrr : PSI<0x12, MRMSrcReg, (outs VR128:$dst),
                                       (ins VR128:$src1, VR128:$src2),
                      "movhlps\t{$src2, $dst|$dst, $src2}",
                      [(set VR128:$dst,
                        (v4f32 (X86Movhlps VR128:$src1, VR128:$src2)))],
                        IIC_SSE_MOV_LH>, Sched<[WriteShuffle]>;
}
1371
// Map the integer-typed forms of the Movlhps/Movhlps shuffle nodes onto
// the same FP VEX instructions (the shuffle is type-agnostic).
let Predicates = [UseAVX] in {
  // MOVLHPS patterns
  def : Pat<(v4i32 (X86Movlhps VR128:$src1, VR128:$src2)),
            (VMOVLHPSrr VR128:$src1, VR128:$src2)>;
  def : Pat<(v2i64 (X86Movlhps VR128:$src1, VR128:$src2)),
            (VMOVLHPSrr (v2i64 VR128:$src1), VR128:$src2)>;

  // MOVHLPS patterns
  def : Pat<(v4i32 (X86Movhlps VR128:$src1, VR128:$src2)),
            (VMOVHLPSrr VR128:$src1, VR128:$src2)>;
}
1383
// SSE1-only versions of the integer-typed Movlhps/Movhlps mappings,
// selecting the non-VEX instructions.
let Predicates = [UseSSE1] in {
  // MOVLHPS patterns
  def : Pat<(v4i32 (X86Movlhps VR128:$src1, VR128:$src2)),
            (MOVLHPSrr VR128:$src1, VR128:$src2)>;
  def : Pat<(v2i64 (X86Movlhps VR128:$src1, VR128:$src2)),
            (MOVLHPSrr (v2i64 VR128:$src1), VR128:$src2)>;

  // MOVHLPS patterns
  def : Pat<(v4i32 (X86Movhlps VR128:$src1, VR128:$src2)),
            (MOVHLPSrr VR128:$src1, VR128:$src2)>;
}
1395
1396//===----------------------------------------------------------------------===//
1397// SSE 1 & 2 - Conversion Instructions
1398//===----------------------------------------------------------------------===//
1399
// Itinerary bundles for the conversion instructions below.  Each pairs a
// register-register (rr) and register-memory (rm) itinerary class; the
// `let Sched` wrapper selects the InstrSchedModel write class.
// NOTE(review): SSE_CVT_PD has no Sched override, so it inherits the
// OpndItins default (WriteFAdd, per the class definition) — presumably a
// conversion write class was intended; confirm before trusting its
// scheduling information.
def SSE_CVT_PD : OpndItins<
  IIC_SSE_CVT_PD_RR, IIC_SSE_CVT_PD_RM
>;

let Sched = WriteCvtI2F in
def SSE_CVT_PS : OpndItins<
  IIC_SSE_CVT_PS_RR, IIC_SSE_CVT_PS_RM
>;

let Sched = WriteCvtI2F in
def SSE_CVT_Scalar : OpndItins<
  IIC_SSE_CVT_Scalar_RR, IIC_SSE_CVT_Scalar_RM
>;

// FP-to-int conversions use the WriteCvtF2I scheduling class.
let Sched = WriteCvtF2I in
def SSE_CVT_SS2SI_32 : OpndItins<
  IIC_SSE_CVT_SS2SI32_RR, IIC_SSE_CVT_SS2SI32_RM
>;

let Sched = WriteCvtF2I in
def SSE_CVT_SS2SI_64 : OpndItins<
  IIC_SSE_CVT_SS2SI64_RR, IIC_SSE_CVT_SS2SI64_RM
>;

let Sched = WriteCvtF2I in
def SSE_CVT_SD2SI : OpndItins<
  IIC_SSE_CVT_SD2SI_RR, IIC_SSE_CVT_SD2SI_RM
>;
1428
// Scalar conversion multiclass: emits an rr form (SrcRC -> DstRC via
// OpNode) and an rm form that folds a load of ld_frag from x86memop.
// Scheduling comes from the itinerary bundle's Sched field.
multiclass sse12_cvt_s<bits<8> opc, RegisterClass SrcRC, RegisterClass DstRC,
                     SDNode OpNode, X86MemOperand x86memop, PatFrag ld_frag,
                     string asm, OpndItins itins> {
  def rr : SI<opc, MRMSrcReg, (outs DstRC:$dst), (ins SrcRC:$src), asm,
                        [(set DstRC:$dst, (OpNode SrcRC:$src))],
                        itins.rr>, Sched<[itins.Sched]>;
  def rm : SI<opc, MRMSrcMem, (outs DstRC:$dst), (ins x86memop:$src), asm,
                        [(set DstRC:$dst, (OpNode (ld_frag addr:$src)))],
                        itins.rm>, Sched<[itins.Sched.Folded]>;
}
1439
// Packed conversion multiclass: rr and rm forms with NO selection
// patterns (selection happens elsewhere), hence neverHasSideEffects and
// explicit mayLoad on the memory form.  `d` picks the execution domain.
multiclass sse12_cvt_p<bits<8> opc, RegisterClass SrcRC, RegisterClass DstRC,
                       X86MemOperand x86memop, string asm, Domain d,
                       OpndItins itins> {
let neverHasSideEffects = 1 in {
  def rr : I<opc, MRMSrcReg, (outs DstRC:$dst), (ins SrcRC:$src), asm,
             [], itins.rr, d>, Sched<[itins.Sched]>;
  let mayLoad = 1 in
  def rm : I<opc, MRMSrcMem, (outs DstRC:$dst), (ins x86memop:$src), asm,
             [], itins.rm, d>, Sched<[itins.Sched.Folded]>;
}
}
1451
// AVX three-operand scalar conversion multiclass (pattern-less): $src1 is
// the pass-through register supplying the untouched upper bits, so the rr
// and rm forms take (DstRC:$src1, Src) and use the AVX asm syntax.
multiclass sse12_vcvt_avx<bits<8> opc, RegisterClass SrcRC, RegisterClass DstRC,
                          X86MemOperand x86memop, string asm> {
let neverHasSideEffects = 1, Predicates = [UseAVX] in {
  def rr : SI<opc, MRMSrcReg, (outs DstRC:$dst), (ins DstRC:$src1, SrcRC:$src),
              !strconcat(asm,"\t{$src, $src1, $dst|$dst, $src1, $src}"), []>,
           Sched<[WriteCvtI2F]>;
  let mayLoad = 1 in
  def rm : SI<opc, MRMSrcMem, (outs DstRC:$dst),
              (ins DstRC:$src1, x86memop:$src),
              !strconcat(asm,"\t{$src, $src1, $dst|$dst, $src1, $src}"), []>,
           Sched<[WriteCvtI2FLd, ReadAfterLd]>;
} // neverHasSideEffects = 1
}
1465
// VEX-encoded truncating fp-to-int conversions (cvttss2si/cvttsd2si) for
// 32- and 64-bit destinations, plus "l"/"q"-suffixed assembler aliases.
// The trailing 0 on each InstAlias marks it match-only (not used when
// printing).
let Predicates = [UseAVX] in {
defm VCVTTSS2SI   : sse12_cvt_s<0x2C, FR32, GR32, fp_to_sint, f32mem, loadf32,
                                "cvttss2si\t{$src, $dst|$dst, $src}",
                                SSE_CVT_SS2SI_32>,
                                XS, VEX, VEX_LIG;
defm VCVTTSS2SI64 : sse12_cvt_s<0x2C, FR32, GR64, fp_to_sint, f32mem, loadf32,
                                "cvttss2si\t{$src, $dst|$dst, $src}",
                                SSE_CVT_SS2SI_64>,
                                XS, VEX, VEX_W, VEX_LIG;
defm VCVTTSD2SI   : sse12_cvt_s<0x2C, FR64, GR32, fp_to_sint, f64mem, loadf64,
                                "cvttsd2si\t{$src, $dst|$dst, $src}",
                                SSE_CVT_SD2SI>,
                                XD, VEX, VEX_LIG;
defm VCVTTSD2SI64 : sse12_cvt_s<0x2C, FR64, GR64, fp_to_sint, f64mem, loadf64,
                                "cvttsd2si\t{$src, $dst|$dst, $src}",
                                SSE_CVT_SD2SI>,
                                XD, VEX, VEX_W, VEX_LIG;

def : InstAlias<"vcvttss2si{l}\t{$src, $dst|$dst, $src}",
                (VCVTTSS2SIrr GR32:$dst, FR32:$src), 0>;
def : InstAlias<"vcvttss2si{l}\t{$src, $dst|$dst, $src}",
                (VCVTTSS2SIrm GR32:$dst, f32mem:$src), 0>;
def : InstAlias<"vcvttsd2si{l}\t{$src, $dst|$dst, $src}",
                (VCVTTSD2SIrr GR32:$dst, FR64:$src), 0>;
def : InstAlias<"vcvttsd2si{l}\t{$src, $dst|$dst, $src}",
                (VCVTTSD2SIrm GR32:$dst, f64mem:$src), 0>;
def : InstAlias<"vcvttss2si{q}\t{$src, $dst|$dst, $src}",
                (VCVTTSS2SI64rr GR64:$dst, FR32:$src), 0>;
def : InstAlias<"vcvttss2si{q}\t{$src, $dst|$dst, $src}",
                (VCVTTSS2SI64rm GR64:$dst, f32mem:$src), 0>;
def : InstAlias<"vcvttsd2si{q}\t{$src, $dst|$dst, $src}",
                (VCVTTSD2SI64rr GR64:$dst, FR64:$src), 0>;
def : InstAlias<"vcvttsd2si{q}\t{$src, $dst|$dst, $src}",
                (VCVTTSD2SI64rm GR64:$dst, f64mem:$src), 0>;
}
1501// The assembler can recognize rr 64-bit instructions by seeing a rxx
1502// register, but the same isn't true when only using memory operands,
1503// provide other assembly "l" and "q" forms to address this explicitly
1504// where appropriate to do so.
// VEX int-to-fp conversions with explicit "l"/"q" mnemonic suffixes so
// the memory forms can be disambiguated (see comment above).
defm VCVTSI2SS   : sse12_vcvt_avx<0x2A, GR32, FR32, i32mem, "cvtsi2ss{l}">,
                                  XS, VEX_4V, VEX_LIG;
defm VCVTSI2SS64 : sse12_vcvt_avx<0x2A, GR64, FR32, i64mem, "cvtsi2ss{q}">,
                                  XS, VEX_4V, VEX_W, VEX_LIG;
defm VCVTSI2SD   : sse12_vcvt_avx<0x2A, GR32, FR64, i32mem, "cvtsi2sd{l}">,
                                  XD, VEX_4V, VEX_LIG;
defm VCVTSI2SD64 : sse12_vcvt_avx<0x2A, GR64, FR64, i64mem, "cvtsi2sd{q}">,
                                  XD, VEX_4V, VEX_W, VEX_LIG;
1513
// Unsuffixed-mnemonic aliases and sint_to_fp selection patterns for the
// VEX conversions.  The upper bits of the destination are don't-care for
// the scalar result, so an IMPLICIT_DEF supplies the pass-through operand.
let Predicates = [UseAVX] in {
  // NOTE(review): both aliases use FR64 register classes even for the SS
  // (single-precision) instruction whose operands are FR32 — looks like a
  // copy/paste of the SD line; confirm the intended register classes.
  def : InstAlias<"vcvtsi2ss\t{$src, $src1, $dst|$dst, $src1, $src}",
                (VCVTSI2SSrm FR64:$dst, FR64:$src1, i32mem:$src)>;
  def : InstAlias<"vcvtsi2sd\t{$src, $src1, $dst|$dst, $src1, $src}",
                (VCVTSI2SDrm FR64:$dst, FR64:$src1, i32mem:$src)>;

  def : Pat<(f32 (sint_to_fp (loadi32 addr:$src))),
            (VCVTSI2SSrm (f32 (IMPLICIT_DEF)), addr:$src)>;
  def : Pat<(f32 (sint_to_fp (loadi64 addr:$src))),
            (VCVTSI2SS64rm (f32 (IMPLICIT_DEF)), addr:$src)>;
  def : Pat<(f64 (sint_to_fp (loadi32 addr:$src))),
            (VCVTSI2SDrm (f64 (IMPLICIT_DEF)), addr:$src)>;
  def : Pat<(f64 (sint_to_fp (loadi64 addr:$src))),
            (VCVTSI2SD64rm (f64 (IMPLICIT_DEF)), addr:$src)>;

  def : Pat<(f32 (sint_to_fp GR32:$src)),
            (VCVTSI2SSrr (f32 (IMPLICIT_DEF)), GR32:$src)>;
  def : Pat<(f32 (sint_to_fp GR64:$src)),
            (VCVTSI2SS64rr (f32 (IMPLICIT_DEF)), GR64:$src)>;
  def : Pat<(f64 (sint_to_fp GR32:$src)),
            (VCVTSI2SDrr (f64 (IMPLICIT_DEF)), GR32:$src)>;
  def : Pat<(f64 (sint_to_fp GR64:$src)),
            (VCVTSI2SD64rr (f64 (IMPLICIT_DEF)), GR64:$src)>;
}
1538
// Legacy (non-VEX) encodings of the truncating fp-to-int and int-to-fp
// scalar conversions; REX_W marks the 64-bit GPR variants.
defm CVTTSS2SI : sse12_cvt_s<0x2C, FR32, GR32, fp_to_sint, f32mem, loadf32,
                      "cvttss2si\t{$src, $dst|$dst, $src}",
                      SSE_CVT_SS2SI_32>, XS;
defm CVTTSS2SI64 : sse12_cvt_s<0x2C, FR32, GR64, fp_to_sint, f32mem, loadf32,
                      "cvttss2si\t{$src, $dst|$dst, $src}",
                      SSE_CVT_SS2SI_64>, XS, REX_W;
defm CVTTSD2SI : sse12_cvt_s<0x2C, FR64, GR32, fp_to_sint, f64mem, loadf64,
                      "cvttsd2si\t{$src, $dst|$dst, $src}",
                      SSE_CVT_SD2SI>, XD;
defm CVTTSD2SI64 : sse12_cvt_s<0x2C, FR64, GR64, fp_to_sint, f64mem, loadf64,
                      "cvttsd2si\t{$src, $dst|$dst, $src}",
                      SSE_CVT_SD2SI>, XD, REX_W;
defm CVTSI2SS  : sse12_cvt_s<0x2A, GR32, FR32, sint_to_fp, i32mem, loadi32,
                      "cvtsi2ss{l}\t{$src, $dst|$dst, $src}",
                      SSE_CVT_Scalar>, XS;
defm CVTSI2SS64 : sse12_cvt_s<0x2A, GR64, FR32, sint_to_fp, i64mem, loadi64,
                      "cvtsi2ss{q}\t{$src, $dst|$dst, $src}",
                      SSE_CVT_Scalar>, XS, REX_W;
defm CVTSI2SD  : sse12_cvt_s<0x2A, GR32, FR64, sint_to_fp, i32mem, loadi32,
                      "cvtsi2sd{l}\t{$src, $dst|$dst, $src}",
                      SSE_CVT_Scalar>, XD;
defm CVTSI2SD64 : sse12_cvt_s<0x2A, GR64, FR64, sint_to_fp, i64mem, loadi64,
                      "cvtsi2sd{q}\t{$src, $dst|$dst, $src}",
                      SSE_CVT_Scalar>, XD, REX_W;
1563
1564def : InstAlias<"cvttss2si{l}\t{$src, $dst|$dst, $src}",
1565                (CVTTSS2SIrr GR32:$dst, FR32:$src), 0>;
1566def : InstAlias<"cvttss2si{l}\t{$src, $dst|$dst, $src}",
1567                (CVTTSS2SIrm GR32:$dst, f32mem:$src), 0>;
1568def : InstAlias<"cvttsd2si{l}\t{$src, $dst|$dst, $src}",
1569                (CVTTSD2SIrr GR32:$dst, FR64:$src), 0>;
1570def : InstAlias<"cvttsd2si{l}\t{$src, $dst|$dst, $src}",
1571                (CVTTSD2SIrm GR32:$dst, f64mem:$src), 0>;
1572def : InstAlias<"cvttss2si{q}\t{$src, $dst|$dst, $src}",
1573                (CVTTSS2SI64rr GR64:$dst, FR32:$src), 0>;
1574def : InstAlias<"cvttss2si{q}\t{$src, $dst|$dst, $src}",
1575                (CVTTSS2SI64rm GR64:$dst, f32mem:$src), 0>;
1576def : InstAlias<"cvttsd2si{q}\t{$src, $dst|$dst, $src}",
1577                (CVTTSD2SI64rr GR64:$dst, FR64:$src), 0>;
1578def : InstAlias<"cvttsd2si{q}\t{$src, $dst|$dst, $src}",
1579                (CVTTSD2SI64rm GR64:$dst, f64mem:$src), 0>;
1580
1581def : InstAlias<"cvtsi2ss\t{$src, $dst|$dst, $src}",
1582                (CVTSI2SSrm FR64:$dst, i32mem:$src)>;
1583def : InstAlias<"cvtsi2sd\t{$src, $dst|$dst, $src}",
1584                (CVTSI2SDrm FR64:$dst, i32mem:$src)>;
1585
1586// Conversion Instructions Intrinsics - Match intrinsics which expect MM
1587// and/or XMM operand(s).
1588
// Intrinsic-based conversion multiclass: matches the given intrinsic
// directly (rr) or through a ComplexPattern memory operand (rm).
multiclass sse12_cvt_sint<bits<8> opc, RegisterClass SrcRC, RegisterClass DstRC,
                         Intrinsic Int, Operand memop, ComplexPattern mem_cpat,
                         string asm, OpndItins itins> {
  def rr : SI<opc, MRMSrcReg, (outs DstRC:$dst), (ins SrcRC:$src),
              !strconcat(asm, "\t{$src, $dst|$dst, $src}"),
              [(set DstRC:$dst, (Int SrcRC:$src))], itins.rr>,
           Sched<[itins.Sched]>;
  def rm : SI<opc, MRMSrcMem, (outs DstRC:$dst), (ins memop:$src),
              !strconcat(asm, "\t{$src, $dst|$dst, $src}"),
              [(set DstRC:$dst, (Int mem_cpat:$src))], itins.rm>,
           Sched<[itins.Sched.Folded]>;
}
1601
// Intrinsic-based conversion multiclass with a tied/extra first source
// operand.  Is2Addr selects between the two-operand SSE asm string and
// the three-operand AVX one.
multiclass sse12_cvt_sint_3addr<bits<8> opc, RegisterClass SrcRC,
                    RegisterClass DstRC, Intrinsic Int, X86MemOperand x86memop,
                    PatFrag ld_frag, string asm, OpndItins itins,
                    bit Is2Addr = 1> {
  def rr : SI<opc, MRMSrcReg, (outs DstRC:$dst), (ins DstRC:$src1, SrcRC:$src2),
              !if(Is2Addr,
                  !strconcat(asm, "\t{$src2, $dst|$dst, $src2}"),
                  !strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
              [(set DstRC:$dst, (Int DstRC:$src1, SrcRC:$src2))],
              itins.rr>, Sched<[itins.Sched]>;
  def rm : SI<opc, MRMSrcMem, (outs DstRC:$dst),
              (ins DstRC:$src1, x86memop:$src2),
              !if(Is2Addr,
                  !strconcat(asm, "\t{$src2, $dst|$dst, $src2}"),
                  !strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
              [(set DstRC:$dst, (Int DstRC:$src1, (ld_frag addr:$src2)))],
              itins.rm>, Sched<[itins.Sched.Folded, ReadAfterLd]>;
}
1620
// Rounding (non-truncating) cvtsd2si via its intrinsic, in VEX and
// legacy encodings for 32- and 64-bit GPR destinations.
let Predicates = [UseAVX] in {
defm VCVTSD2SI : sse12_cvt_sint<0x2D, VR128, GR32,
                  int_x86_sse2_cvtsd2si, sdmem, sse_load_f64, "cvtsd2si",
                  SSE_CVT_SD2SI>, XD, VEX, VEX_LIG;
defm VCVTSD2SI64 : sse12_cvt_sint<0x2D, VR128, GR64,
                    int_x86_sse2_cvtsd2si64, sdmem, sse_load_f64, "cvtsd2si",
                    SSE_CVT_SD2SI>, XD, VEX, VEX_W, VEX_LIG;
}
defm CVTSD2SI : sse12_cvt_sint<0x2D, VR128, GR32, int_x86_sse2_cvtsd2si,
                 sdmem, sse_load_f64, "cvtsd2si", SSE_CVT_SD2SI>, XD;
defm CVTSD2SI64 : sse12_cvt_sint<0x2D, VR128, GR64, int_x86_sse2_cvtsd2si64,
                   sdmem, sse_load_f64, "cvtsd2si", SSE_CVT_SD2SI>, XD, REX_W;
1633
1634
// Intrinsic forms of cvtsi2ss/cvtsi2sd.  The AVX versions pass
// Is2Addr = 0 to get the three-operand asm string; the SSE versions tie
// $src1 to $dst via Constraints and use the default two-operand string.
let Predicates = [UseAVX] in {
defm Int_VCVTSI2SS : sse12_cvt_sint_3addr<0x2A, GR32, VR128,
          int_x86_sse_cvtsi2ss, i32mem, loadi32, "cvtsi2ss{l}",
          SSE_CVT_Scalar, 0>, XS, VEX_4V;
defm Int_VCVTSI2SS64 : sse12_cvt_sint_3addr<0x2A, GR64, VR128,
          int_x86_sse_cvtsi642ss, i64mem, loadi64, "cvtsi2ss{q}",
          SSE_CVT_Scalar, 0>, XS, VEX_4V,
          VEX_W;
defm Int_VCVTSI2SD : sse12_cvt_sint_3addr<0x2A, GR32, VR128,
          int_x86_sse2_cvtsi2sd, i32mem, loadi32, "cvtsi2sd{l}",
          SSE_CVT_Scalar, 0>, XD, VEX_4V;
defm Int_VCVTSI2SD64 : sse12_cvt_sint_3addr<0x2A, GR64, VR128,
          int_x86_sse2_cvtsi642sd, i64mem, loadi64, "cvtsi2sd{q}",
          SSE_CVT_Scalar, 0>, XD,
          VEX_4V, VEX_W;
}
let Constraints = "$src1 = $dst" in {
  defm Int_CVTSI2SS : sse12_cvt_sint_3addr<0x2A, GR32, VR128,
                        int_x86_sse_cvtsi2ss, i32mem, loadi32,
                        "cvtsi2ss{l}", SSE_CVT_Scalar>, XS;
  defm Int_CVTSI2SS64 : sse12_cvt_sint_3addr<0x2A, GR64, VR128,
                        int_x86_sse_cvtsi642ss, i64mem, loadi64,
                        "cvtsi2ss{q}", SSE_CVT_Scalar>, XS, REX_W;
  defm Int_CVTSI2SD : sse12_cvt_sint_3addr<0x2A, GR32, VR128,
                        int_x86_sse2_cvtsi2sd, i32mem, loadi32,
                        "cvtsi2sd{l}", SSE_CVT_Scalar>, XD;
  defm Int_CVTSI2SD64 : sse12_cvt_sint_3addr<0x2A, GR64, VR128,
                        int_x86_sse2_cvtsi642sd, i64mem, loadi64,
                        "cvtsi2sd{q}", SSE_CVT_Scalar>, XD, REX_W;
}
1665
1666/// SSE 1 Only
1667
1668// Aliases for intrinsics
// Intrinsic forms of the truncating conversions, taking a full XMM
// source (VR128) instead of a scalar FP register.  VEX then legacy.
let Predicates = [UseAVX] in {
defm Int_VCVTTSS2SI : sse12_cvt_sint<0x2C, VR128, GR32, int_x86_sse_cvttss2si,
                                    ssmem, sse_load_f32, "cvttss2si",
                                    SSE_CVT_SS2SI_32>, XS, VEX;
defm Int_VCVTTSS2SI64 : sse12_cvt_sint<0x2C, VR128, GR64,
                                   int_x86_sse_cvttss2si64, ssmem, sse_load_f32,
                                   "cvttss2si", SSE_CVT_SS2SI_64>,
                                   XS, VEX, VEX_W;
defm Int_VCVTTSD2SI : sse12_cvt_sint<0x2C, VR128, GR32, int_x86_sse2_cvttsd2si,
                                    sdmem, sse_load_f64, "cvttsd2si",
                                    SSE_CVT_SD2SI>, XD, VEX;
defm Int_VCVTTSD2SI64 : sse12_cvt_sint<0x2C, VR128, GR64,
                                  int_x86_sse2_cvttsd2si64, sdmem, sse_load_f64,
                                  "cvttsd2si", SSE_CVT_SD2SI>,
                                  XD, VEX, VEX_W;
}
defm Int_CVTTSS2SI : sse12_cvt_sint<0x2C, VR128, GR32, int_x86_sse_cvttss2si,
                                    ssmem, sse_load_f32, "cvttss2si",
                                    SSE_CVT_SS2SI_32>, XS;
defm Int_CVTTSS2SI64 : sse12_cvt_sint<0x2C, VR128, GR64,
                                   int_x86_sse_cvttss2si64, ssmem, sse_load_f32,
                                   "cvttss2si", SSE_CVT_SS2SI_64>, XS, REX_W;
defm Int_CVTTSD2SI : sse12_cvt_sint<0x2C, VR128, GR32, int_x86_sse2_cvttsd2si,
                                    sdmem, sse_load_f64, "cvttsd2si",
                                    SSE_CVT_SD2SI>, XD;
defm Int_CVTTSD2SI64 : sse12_cvt_sint<0x2C, VR128, GR64,
                                  int_x86_sse2_cvttsd2si64, sdmem, sse_load_f64,
                                  "cvttsd2si", SSE_CVT_SD2SI>, XD, REX_W;
1697
// Rounding (non-truncating) cvtss2si via its intrinsic, VEX and legacy
// encodings for 32- and 64-bit GPR destinations.
let Predicates = [UseAVX] in {
defm VCVTSS2SI   : sse12_cvt_sint<0x2D, VR128, GR32, int_x86_sse_cvtss2si,
                                  ssmem, sse_load_f32, "cvtss2si",
                                  SSE_CVT_SS2SI_32>, XS, VEX, VEX_LIG;
defm VCVTSS2SI64 : sse12_cvt_sint<0x2D, VR128, GR64, int_x86_sse_cvtss2si64,
                                  ssmem, sse_load_f32, "cvtss2si",
                                  SSE_CVT_SS2SI_64>, XS, VEX, VEX_W, VEX_LIG;
}
defm CVTSS2SI : sse12_cvt_sint<0x2D, VR128, GR32, int_x86_sse_cvtss2si,
                               ssmem, sse_load_f32, "cvtss2si",
                               SSE_CVT_SS2SI_32>, XS;
defm CVTSS2SI64 : sse12_cvt_sint<0x2D, VR128, GR64, int_x86_sse_cvtss2si64,
                                 ssmem, sse_load_f32, "cvtss2si",
                                 SSE_CVT_SS2SI_64>, XS, REX_W;
1712
// Packed int32 -> float conversions (pattern-less; see sse12_cvt_p).
// VEX 128-bit and 256-bit (VEX_L) forms, then the legacy SSE2 form.
defm VCVTDQ2PS   : sse12_cvt_p<0x5B, VR128, VR128, i128mem,
                               "vcvtdq2ps\t{$src, $dst|$dst, $src}",
                               SSEPackedSingle, SSE_CVT_PS>,
                               TB, VEX, Requires<[HasAVX]>;
defm VCVTDQ2PSY  : sse12_cvt_p<0x5B, VR256, VR256, i256mem,
                               "vcvtdq2ps\t{$src, $dst|$dst, $src}",
                               SSEPackedSingle, SSE_CVT_PS>,
                               TB, VEX, VEX_L, Requires<[HasAVX]>;

defm CVTDQ2PS : sse12_cvt_p<0x5B, VR128, VR128, i128mem,
                            "cvtdq2ps\t{$src, $dst|$dst, $src}",
                            SSEPackedSingle, SSE_CVT_PS>,
                            TB, Requires<[UseSSE2]>;
1726
// "l"/"q"-suffixed aliases for the VEX rounding conversions; trailing 0
// keeps each alias match-only (not used for printing).
let Predicates = [UseAVX] in {
def : InstAlias<"vcvtss2si{l}\t{$src, $dst|$dst, $src}",
                (VCVTSS2SIrr GR32:$dst, VR128:$src), 0>;
def : InstAlias<"vcvtss2si{l}\t{$src, $dst|$dst, $src}",
                (VCVTSS2SIrm GR32:$dst, ssmem:$src), 0>;
def : InstAlias<"vcvtsd2si{l}\t{$src, $dst|$dst, $src}",
                (VCVTSD2SIrr GR32:$dst, VR128:$src), 0>;
def : InstAlias<"vcvtsd2si{l}\t{$src, $dst|$dst, $src}",
                (VCVTSD2SIrm GR32:$dst, sdmem:$src), 0>;
def : InstAlias<"vcvtss2si{q}\t{$src, $dst|$dst, $src}",
                (VCVTSS2SI64rr GR64:$dst, VR128:$src), 0>;
def : InstAlias<"vcvtss2si{q}\t{$src, $dst|$dst, $src}",
                (VCVTSS2SI64rm GR64:$dst, ssmem:$src), 0>;
def : InstAlias<"vcvtsd2si{q}\t{$src, $dst|$dst, $src}",
                (VCVTSD2SI64rr GR64:$dst, VR128:$src), 0>;
def : InstAlias<"vcvtsd2si{q}\t{$src, $dst|$dst, $src}",
                (VCVTSD2SI64rm GR64:$dst, sdmem:$src), 0>;
}
1745
1746def : InstAlias<"cvtss2si{l}\t{$src, $dst|$dst, $src}",
1747                (CVTSS2SIrr GR32:$dst, VR128:$src), 0>;
1748def : InstAlias<"cvtss2si{l}\t{$src, $dst|$dst, $src}",
1749                (CVTSS2SIrm GR32:$dst, ssmem:$src), 0>;
1750def : InstAlias<"cvtsd2si{l}\t{$src, $dst|$dst, $src}",
1751                (CVTSD2SIrr GR32:$dst, VR128:$src), 0>;
1752def : InstAlias<"cvtsd2si{l}\t{$src, $dst|$dst, $src}",
1753                (CVTSD2SIrm GR32:$dst, sdmem:$src), 0>;
1754def : InstAlias<"cvtss2si{q}\t{$src, $dst|$dst, $src}",
1755                (CVTSS2SI64rr GR64:$dst, VR128:$src), 0>;
1756def : InstAlias<"cvtss2si{q}\t{$src, $dst|$dst, $src}",
1757                (CVTSS2SI64rm GR64:$dst, ssmem:$src), 0>;
1758def : InstAlias<"cvtsd2si{q}\t{$src, $dst|$dst, $src}",
1759                (CVTSD2SI64rr GR64:$dst, VR128:$src), 0>;
1760def : InstAlias<"cvtsd2si{q}\t{$src, $dst|$dst, $src}",
1761                (CVTSD2SI64rm GR64:$dst, sdmem:$src)>;
1762
1763/// SSE 2 Only
1764
1765// Convert scalar double to scalar single
// VEX scalar double -> single conversion (pattern-less register forms;
// selection is done by the Pat below).  The memory form is restricted to
// OptForSize because the folded load is only profitable for code size.
let neverHasSideEffects = 1, Predicates = [UseAVX] in {
def VCVTSD2SSrr  : VSDI<0x5A, MRMSrcReg, (outs FR32:$dst),
                       (ins FR64:$src1, FR64:$src2),
                      "cvtsd2ss\t{$src2, $src1, $dst|$dst, $src1, $src2}", [],
                      IIC_SSE_CVT_Scalar_RR>, VEX_4V, VEX_LIG,
                      Sched<[WriteCvtF2F]>;
let mayLoad = 1 in
def VCVTSD2SSrm  : I<0x5A, MRMSrcMem, (outs FR32:$dst),
                       (ins FR64:$src1, f64mem:$src2),
                      "vcvtsd2ss\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                      [], IIC_SSE_CVT_Scalar_RM>,
                      XD, Requires<[HasAVX, OptForSize]>, VEX_4V, VEX_LIG,
                      Sched<[WriteCvtF2FLd, ReadAfterLd]>;
}
1780
// Select VCVTSD2SSrr for fround, passing $src as both the pass-through
// and the value operand.
def : Pat<(f32 (fround FR64:$src)), (VCVTSD2SSrr FR64:$src, FR64:$src)>,
          Requires<[UseAVX]>;

// Legacy SSE2 double -> single conversion; rm form only under OptForSize
// (same code-size rationale as the VEX form above).
def CVTSD2SSrr  : SDI<0x5A, MRMSrcReg, (outs FR32:$dst), (ins FR64:$src),
                      "cvtsd2ss\t{$src, $dst|$dst, $src}",
                      [(set FR32:$dst, (fround FR64:$src))],
                      IIC_SSE_CVT_Scalar_RR>, Sched<[WriteCvtF2F]>;
def CVTSD2SSrm  : I<0x5A, MRMSrcMem, (outs FR32:$dst), (ins f64mem:$src),
                      "cvtsd2ss\t{$src, $dst|$dst, $src}",
                      [(set FR32:$dst, (fround (loadf64 addr:$src)))],
                      IIC_SSE_CVT_Scalar_RM>,
                      XD,
                  Requires<[UseSSE2, OptForSize]>, Sched<[WriteCvtF2FLd]>;
1794
// Intrinsic forms of VCVTSD2SS operating on full XMM registers.
def Int_VCVTSD2SSrr: I<0x5A, MRMSrcReg,
                       (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
                       "vcvtsd2ss\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                       [(set VR128:$dst,
                         (int_x86_sse2_cvtsd2ss VR128:$src1, VR128:$src2))],
                       IIC_SSE_CVT_Scalar_RR>, XD, VEX_4V, Requires<[UseAVX]>,
                       Sched<[WriteCvtF2F]>;
// Bug fix: this memory form was declared MRMSrcReg despite taking an
// sdmem source; a memory source must use MRMSrcMem (as Int_VCVTSS2SDrm
// does) for the ModRM byte to be encoded correctly.
def Int_VCVTSD2SSrm: I<0x5A, MRMSrcMem,
                       (outs VR128:$dst), (ins VR128:$src1, sdmem:$src2),
                       "vcvtsd2ss\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                       [(set VR128:$dst, (int_x86_sse2_cvtsd2ss
                                          VR128:$src1, sse_load_f64:$src2))],
                       IIC_SSE_CVT_Scalar_RM>, XD, VEX_4V, Requires<[UseAVX]>,
                       Sched<[WriteCvtF2FLd, ReadAfterLd]>;
1809
// Legacy intrinsic forms of CVTSD2SS; $src1 tied to $dst.
let Constraints = "$src1 = $dst" in {
def Int_CVTSD2SSrr: I<0x5A, MRMSrcReg,
                       (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
                       "cvtsd2ss\t{$src2, $dst|$dst, $src2}",
                       [(set VR128:$dst,
                         (int_x86_sse2_cvtsd2ss VR128:$src1, VR128:$src2))],
                       IIC_SSE_CVT_Scalar_RR>, XD, Requires<[UseSSE2]>,
                       Sched<[WriteCvtF2F]>;
// Bug fix: the memory form was declared MRMSrcReg despite its sdmem
// source operand; it must be MRMSrcMem (matching Int_CVTSS2SDrm) so the
// ModRM byte encodes a memory operand.
def Int_CVTSD2SSrm: I<0x5A, MRMSrcMem,
                       (outs VR128:$dst), (ins VR128:$src1, sdmem:$src2),
                       "cvtsd2ss\t{$src2, $dst|$dst, $src2}",
                       [(set VR128:$dst, (int_x86_sse2_cvtsd2ss
                                          VR128:$src1, sse_load_f64:$src2))],
                       IIC_SSE_CVT_Scalar_RM>, XD, Requires<[UseSSE2]>,
                       Sched<[WriteCvtF2FLd, ReadAfterLd]>;
}
1826
1827// Convert scalar single to scalar double
1828// SSE2 instructions with XS prefix
// VEX scalar single -> double conversion (pattern-less register forms;
// selection is done by the Pats below).  Memory form restricted to
// OptForSize like VCVTSD2SSrm.
let neverHasSideEffects = 1, Predicates = [UseAVX] in {
def VCVTSS2SDrr : I<0x5A, MRMSrcReg, (outs FR64:$dst),
                    (ins FR32:$src1, FR32:$src2),
                    "vcvtss2sd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                    [], IIC_SSE_CVT_Scalar_RR>,
                    XS, Requires<[HasAVX]>, VEX_4V, VEX_LIG,
                    Sched<[WriteCvtF2F]>;
let mayLoad = 1 in
def VCVTSS2SDrm : I<0x5A, MRMSrcMem, (outs FR64:$dst),
                    (ins FR32:$src1, f32mem:$src2),
                    "vcvtss2sd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                    [], IIC_SSE_CVT_Scalar_RM>,
                    XS, VEX_4V, VEX_LIG, Requires<[HasAVX, OptForSize]>,
                    Sched<[WriteCvtF2FLd, ReadAfterLd]>;
}
1844
// AVX selection for fextend/extloadf32.  Under OptForSize the folded-load
// rm form is used; under OptForSpeed the load is done with VMOVSSrm and
// converted in a register.  IMPLICIT_DEF fills the pass-through operand.
def : Pat<(f64 (fextend FR32:$src)),
    (VCVTSS2SDrr FR32:$src, FR32:$src)>, Requires<[UseAVX]>;
def : Pat<(fextend (loadf32 addr:$src)),
    (VCVTSS2SDrm (f32 (IMPLICIT_DEF)), addr:$src)>, Requires<[UseAVX]>;

def : Pat<(extloadf32 addr:$src),
    (VCVTSS2SDrm (f32 (IMPLICIT_DEF)), addr:$src)>,
    Requires<[UseAVX, OptForSize]>;
def : Pat<(extloadf32 addr:$src),
    (VCVTSS2SDrr (f32 (IMPLICIT_DEF)), (VMOVSSrm addr:$src))>,
    Requires<[UseAVX, OptForSpeed]>;
1856
// Legacy SSE2 single -> double conversion; rm form only under OptForSize.
def CVTSS2SDrr : I<0x5A, MRMSrcReg, (outs FR64:$dst), (ins FR32:$src),
                   "cvtss2sd\t{$src, $dst|$dst, $src}",
                   [(set FR64:$dst, (fextend FR32:$src))],
                   IIC_SSE_CVT_Scalar_RR>, XS,
                 Requires<[UseSSE2]>, Sched<[WriteCvtF2F]>;
def CVTSS2SDrm : I<0x5A, MRMSrcMem, (outs FR64:$dst), (ins f32mem:$src),
                   "cvtss2sd\t{$src, $dst|$dst, $src}",
                   [(set FR64:$dst, (extloadf32 addr:$src))],
                   IIC_SSE_CVT_Scalar_RM>, XS,
                 Requires<[UseSSE2, OptForSize]>, Sched<[WriteCvtF2FLd]>;
1867
1868// extload f32 -> f64.  This matches load+fextend because we have a hack in
1869// the isel (PreprocessForFPConvert) that can introduce loads after dag
1870// combine.
1871// Since these loads aren't folded into the fextend, we have to match it
1872// explicitly here.
// SSE2 counterparts of the AVX patterns above (see the comment block
// above about why load+fextend must be matched explicitly here).
def : Pat<(fextend (loadf32 addr:$src)),
          (CVTSS2SDrm addr:$src)>, Requires<[UseSSE2]>;
def : Pat<(extloadf32 addr:$src),
          (CVTSS2SDrr (MOVSSrm addr:$src))>, Requires<[UseSSE2, OptForSpeed]>;
1877
// Intrinsic forms of (V)CVTSS2SD operating on full XMM registers; VEX
// three-operand versions first, then legacy two-operand versions with
// $src1 tied to $dst.
def Int_VCVTSS2SDrr: I<0x5A, MRMSrcReg,
                      (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
                    "vcvtss2sd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                    [(set VR128:$dst,
                      (int_x86_sse2_cvtss2sd VR128:$src1, VR128:$src2))],
                    IIC_SSE_CVT_Scalar_RR>, XS, VEX_4V, Requires<[UseAVX]>,
                    Sched<[WriteCvtF2F]>;
def Int_VCVTSS2SDrm: I<0x5A, MRMSrcMem,
                      (outs VR128:$dst), (ins VR128:$src1, ssmem:$src2),
                    "vcvtss2sd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                    [(set VR128:$dst,
                      (int_x86_sse2_cvtss2sd VR128:$src1, sse_load_f32:$src2))],
                    IIC_SSE_CVT_Scalar_RM>, XS, VEX_4V, Requires<[UseAVX]>,
                    Sched<[WriteCvtF2FLd, ReadAfterLd]>;
let Constraints = "$src1 = $dst" in { // SSE2 instructions with XS prefix
def Int_CVTSS2SDrr: I<0x5A, MRMSrcReg,
                      (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
                    "cvtss2sd\t{$src2, $dst|$dst, $src2}",
                    [(set VR128:$dst,
                      (int_x86_sse2_cvtss2sd VR128:$src1, VR128:$src2))],
                    IIC_SSE_CVT_Scalar_RR>, XS, Requires<[UseSSE2]>,
                    Sched<[WriteCvtF2F]>;
def Int_CVTSS2SDrm: I<0x5A, MRMSrcMem,
                      (outs VR128:$dst), (ins VR128:$src1, ssmem:$src2),
                    "cvtss2sd\t{$src2, $dst|$dst, $src2}",
                    [(set VR128:$dst,
                      (int_x86_sse2_cvtss2sd VR128:$src1, sse_load_f32:$src2))],
                    IIC_SSE_CVT_Scalar_RM>, XS, Requires<[UseSSE2]>,
                    Sched<[WriteCvtF2FLd, ReadAfterLd]>;
}
1908
// Convert packed single/double fp to doubleword
// AVX (VEX-encoded) 128/256-bit forms, followed by the SSE2 forms.
def VCVTPS2DQrr : VPDI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                       "cvtps2dq\t{$src, $dst|$dst, $src}",
                       [(set VR128:$dst, (int_x86_sse2_cvtps2dq VR128:$src))],
                       IIC_SSE_CVT_PS_RR>, VEX, Sched<[WriteCvtF2I]>;
def VCVTPS2DQrm : VPDI<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
                       "cvtps2dq\t{$src, $dst|$dst, $src}",
                       [(set VR128:$dst,
                         (int_x86_sse2_cvtps2dq (loadv4f32 addr:$src)))],
                       IIC_SSE_CVT_PS_RM>, VEX, Sched<[WriteCvtF2ILd]>;
def VCVTPS2DQYrr : VPDI<0x5B, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
                        "cvtps2dq\t{$src, $dst|$dst, $src}",
                        [(set VR256:$dst,
                          (int_x86_avx_cvt_ps2dq_256 VR256:$src))],
                        IIC_SSE_CVT_PS_RR>, VEX, VEX_L, Sched<[WriteCvtF2I]>;
def VCVTPS2DQYrm : VPDI<0x5B, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src),
                        "cvtps2dq\t{$src, $dst|$dst, $src}",
                        [(set VR256:$dst,
                          (int_x86_avx_cvt_ps2dq_256 (loadv8f32 addr:$src)))],
                        IIC_SSE_CVT_PS_RM>, VEX, VEX_L, Sched<[WriteCvtF2ILd]>;
// SSE2 forms fold memopv4f32 rather than loadv4f32 — presumably the
// alignment-checking load fragment for pre-AVX folding; verify against the
// PatFrag definitions.
def CVTPS2DQrr : PDI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                     "cvtps2dq\t{$src, $dst|$dst, $src}",
                     [(set VR128:$dst, (int_x86_sse2_cvtps2dq VR128:$src))],
                     IIC_SSE_CVT_PS_RR>, Sched<[WriteCvtF2I]>;
def CVTPS2DQrm : PDI<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
                     "cvtps2dq\t{$src, $dst|$dst, $src}",
                     [(set VR128:$dst,
                       (int_x86_sse2_cvtps2dq (memopv4f32 addr:$src)))],
                     IIC_SSE_CVT_PS_RM>, Sched<[WriteCvtF2ILd]>;
1938
1939
// Convert Packed Double FP to Packed DW Integers
let Predicates = [HasAVX] in {
// The assembler can recognize rr 256-bit instructions by seeing a ymm
// register, but the same isn't true when using memory operands instead.
// Provide other assembly rr and rm forms to address this explicitly.
def VCVTPD2DQrr  : SDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                       "vcvtpd2dq\t{$src, $dst|$dst, $src}",
                       [(set VR128:$dst, (int_x86_sse2_cvtpd2dq VR128:$src))]>,
                       VEX, Sched<[WriteCvtF2I]>;

// XMM only
// The "x" suffix disambiguates the 128-bit memory form for the assembler.
def : InstAlias<"vcvtpd2dqx\t{$src, $dst|$dst, $src}",
                (VCVTPD2DQrr VR128:$dst, VR128:$src)>;
def VCVTPD2DQXrm : SDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
                       "vcvtpd2dqx\t{$src, $dst|$dst, $src}",
                       [(set VR128:$dst,
                         (int_x86_sse2_cvtpd2dq (loadv2f64 addr:$src)))]>, VEX,
                       Sched<[WriteCvtF2ILd]>;

// YMM only
def VCVTPD2DQYrr : SDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR256:$src),
                       "vcvtpd2dq{y}\t{$src, $dst|$dst, $src}",
                       [(set VR128:$dst,
                         (int_x86_avx_cvt_pd2dq_256 VR256:$src))]>, VEX, VEX_L,
                       Sched<[WriteCvtF2I]>;
def VCVTPD2DQYrm : SDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f256mem:$src),
                       "vcvtpd2dq{y}\t{$src, $dst|$dst, $src}",
                       [(set VR128:$dst,
                         (int_x86_avx_cvt_pd2dq_256 (loadv4f64 addr:$src)))]>,
                       VEX, VEX_L, Sched<[WriteCvtF2ILd]>;
def : InstAlias<"vcvtpd2dq\t{$src, $dst|$dst, $src}",
                (VCVTPD2DQYrr VR128:$dst, VR256:$src)>;
}

// SSE2 forms (non-VEX).
def CVTPD2DQrm  : SDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
                      "cvtpd2dq\t{$src, $dst|$dst, $src}",
                      [(set VR128:$dst,
                        (int_x86_sse2_cvtpd2dq (memopv2f64 addr:$src)))],
                      IIC_SSE_CVT_PD_RM>, Sched<[WriteCvtF2ILd]>;
def CVTPD2DQrr  : SDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                      "cvtpd2dq\t{$src, $dst|$dst, $src}",
                      [(set VR128:$dst, (int_x86_sse2_cvtpd2dq VR128:$src))],
                      IIC_SSE_CVT_PD_RR>, Sched<[WriteCvtF2I]>;
1983
// Convert with truncation packed single/double fp to doubleword
// SSE2 packed instructions with XS prefix
def VCVTTPS2DQrr : VS2SI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                         "cvttps2dq\t{$src, $dst|$dst, $src}",
                         [(set VR128:$dst,
                           (int_x86_sse2_cvttps2dq VR128:$src))],
                         IIC_SSE_CVT_PS_RR>, VEX, Sched<[WriteCvtF2I]>;
def VCVTTPS2DQrm : VS2SI<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
                         "cvttps2dq\t{$src, $dst|$dst, $src}",
                         [(set VR128:$dst, (int_x86_sse2_cvttps2dq
                                            (loadv4f32 addr:$src)))],
                         IIC_SSE_CVT_PS_RM>, VEX, Sched<[WriteCvtF2ILd]>;
def VCVTTPS2DQYrr : VS2SI<0x5B, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
                          "cvttps2dq\t{$src, $dst|$dst, $src}",
                          [(set VR256:$dst,
                            (int_x86_avx_cvtt_ps2dq_256 VR256:$src))],
                          IIC_SSE_CVT_PS_RR>, VEX, VEX_L, Sched<[WriteCvtF2I]>;
def VCVTTPS2DQYrm : VS2SI<0x5B, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src),
                          "cvttps2dq\t{$src, $dst|$dst, $src}",
                          [(set VR256:$dst, (int_x86_avx_cvtt_ps2dq_256
                                             (loadv8f32 addr:$src)))],
                          IIC_SSE_CVT_PS_RM>, VEX, VEX_L,
                          Sched<[WriteCvtF2ILd]>;

// SSE2 forms (non-VEX).
def CVTTPS2DQrr : S2SI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                       "cvttps2dq\t{$src, $dst|$dst, $src}",
                       [(set VR128:$dst, (int_x86_sse2_cvttps2dq VR128:$src))],
                       IIC_SSE_CVT_PS_RR>, Sched<[WriteCvtF2I]>;
def CVTTPS2DQrm : S2SI<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
                       "cvttps2dq\t{$src, $dst|$dst, $src}",
                       [(set VR128:$dst,
                         (int_x86_sse2_cvttps2dq (memopv4f32 addr:$src)))],
                       IIC_SSE_CVT_PS_RM>, Sched<[WriteCvtF2ILd]>;
2017
// Patterns mapping generic sint_to_fp / fp_to_sint nodes (and the cvtdq2ps
// intrinsic) onto the AVX conversion instructions defined above.
let Predicates = [HasAVX] in {
  def : Pat<(v4f32 (sint_to_fp (v4i32 VR128:$src))),
            (VCVTDQ2PSrr VR128:$src)>;
  def : Pat<(v4f32 (sint_to_fp (bc_v4i32 (loadv2i64 addr:$src)))),
            (VCVTDQ2PSrm addr:$src)>;

  def : Pat<(int_x86_sse2_cvtdq2ps VR128:$src),
            (VCVTDQ2PSrr VR128:$src)>;
  def : Pat<(int_x86_sse2_cvtdq2ps (bc_v4i32 (loadv2i64 addr:$src))),
            (VCVTDQ2PSrm addr:$src)>;

  def : Pat<(v4i32 (fp_to_sint (v4f32 VR128:$src))),
            (VCVTTPS2DQrr VR128:$src)>;
  def : Pat<(v4i32 (fp_to_sint (loadv4f32 addr:$src))),
            (VCVTTPS2DQrm addr:$src)>;

  def : Pat<(v8f32 (sint_to_fp (v8i32 VR256:$src))),
            (VCVTDQ2PSYrr VR256:$src)>;
  def : Pat<(v8f32 (sint_to_fp (bc_v8i32 (loadv4i64 addr:$src)))),
            (VCVTDQ2PSYrm addr:$src)>;

  def : Pat<(v8i32 (fp_to_sint (v8f32 VR256:$src))),
            (VCVTTPS2DQYrr VR256:$src)>;
  def : Pat<(v8i32 (fp_to_sint (loadv8f32 addr:$src))),
            (VCVTTPS2DQYrm addr:$src)>;
}

// Same mappings for the SSE2 instructions, using memop load fragments.
let Predicates = [UseSSE2] in {
  def : Pat<(v4f32 (sint_to_fp (v4i32 VR128:$src))),
            (CVTDQ2PSrr VR128:$src)>;
  def : Pat<(v4f32 (sint_to_fp (bc_v4i32 (memopv2i64 addr:$src)))),
            (CVTDQ2PSrm addr:$src)>;

  def : Pat<(int_x86_sse2_cvtdq2ps VR128:$src),
            (CVTDQ2PSrr VR128:$src)>;
  def : Pat<(int_x86_sse2_cvtdq2ps (bc_v4i32 (memopv2i64 addr:$src))),
            (CVTDQ2PSrm addr:$src)>;

  def : Pat<(v4i32 (fp_to_sint (v4f32 VR128:$src))),
            (CVTTPS2DQrr VR128:$src)>;
  def : Pat<(v4i32 (fp_to_sint (memopv4f32 addr:$src))),
            (CVTTPS2DQrm addr:$src)>;
}
2061
// Convert with truncation packed double fp to doubleword.
def VCVTTPD2DQrr : VPDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                        "cvttpd2dq\t{$src, $dst|$dst, $src}",
                        [(set VR128:$dst,
                              (int_x86_sse2_cvttpd2dq VR128:$src))],
                              IIC_SSE_CVT_PD_RR>, VEX, Sched<[WriteCvtF2I]>;

// The assembler can recognize rr 256-bit instructions by seeing a ymm
// register, but the same isn't true when using memory operands instead.
// Provide other assembly rr and rm forms to address this explicitly.

// XMM only
def : InstAlias<"vcvttpd2dqx\t{$src, $dst|$dst, $src}",
                (VCVTTPD2DQrr VR128:$dst, VR128:$src)>;
def VCVTTPD2DQXrm : VPDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
                         "cvttpd2dqx\t{$src, $dst|$dst, $src}",
                         [(set VR128:$dst, (int_x86_sse2_cvttpd2dq
                                            (loadv2f64 addr:$src)))],
                         IIC_SSE_CVT_PD_RM>, VEX, Sched<[WriteCvtF2ILd]>;

// YMM only
def VCVTTPD2DQYrr : VPDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR256:$src),
                         "cvttpd2dq{y}\t{$src, $dst|$dst, $src}",
                         [(set VR128:$dst,
                           (int_x86_avx_cvtt_pd2dq_256 VR256:$src))],
                         IIC_SSE_CVT_PD_RR>, VEX, VEX_L, Sched<[WriteCvtF2I]>;
def VCVTTPD2DQYrm : VPDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f256mem:$src),
                         "cvttpd2dq{y}\t{$src, $dst|$dst, $src}",
                         [(set VR128:$dst,
                          (int_x86_avx_cvtt_pd2dq_256 (loadv4f64 addr:$src)))],
                         IIC_SSE_CVT_PD_RM>, VEX, VEX_L, Sched<[WriteCvtF2ILd]>;
def : InstAlias<"vcvttpd2dq\t{$src, $dst|$dst, $src}",
                (VCVTTPD2DQYrr VR128:$dst, VR256:$src)>;

// fp_to_sint of v4f64 produces v4i32 via the YMM truncating form.
let Predicates = [HasAVX] in {
  def : Pat<(v4i32 (fp_to_sint (v4f64 VR256:$src))),
            (VCVTTPD2DQYrr VR256:$src)>;
  def : Pat<(v4i32 (fp_to_sint (loadv4f64 addr:$src))),
            (VCVTTPD2DQYrm addr:$src)>;
} // Predicates = [HasAVX]

// SSE2 forms (non-VEX).
def CVTTPD2DQrr : PDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                      "cvttpd2dq\t{$src, $dst|$dst, $src}",
                      [(set VR128:$dst, (int_x86_sse2_cvttpd2dq VR128:$src))],
                      IIC_SSE_CVT_PD_RR>, Sched<[WriteCvtF2I]>;
def CVTTPD2DQrm : PDI<0xE6, MRMSrcMem, (outs VR128:$dst),(ins f128mem:$src),
                      "cvttpd2dq\t{$src, $dst|$dst, $src}",
                      [(set VR128:$dst, (int_x86_sse2_cvttpd2dq
                                        (memopv2f64 addr:$src)))],
                                        IIC_SSE_CVT_PD_RM>,
                      Sched<[WriteCvtF2ILd]>;
2112
// Convert packed single to packed double
let Predicates = [HasAVX] in {
                  // SSE2 instructions without OpSize prefix
def VCVTPS2PDrr : I<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                     "vcvtps2pd\t{$src, $dst|$dst, $src}",
                     [(set VR128:$dst, (int_x86_sse2_cvtps2pd VR128:$src))],
                     IIC_SSE_CVT_PD_RR>, TB, VEX, Sched<[WriteCvtF2F]>;
// Memory form widens two f32 elements loaded from a 64-bit location.
def VCVTPS2PDrm : I<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f64mem:$src),
                    "vcvtps2pd\t{$src, $dst|$dst, $src}",
                    [(set VR128:$dst, (v2f64 (extloadv2f32 addr:$src)))],
                    IIC_SSE_CVT_PD_RM>, TB, VEX, Sched<[WriteCvtF2FLd]>;
def VCVTPS2PDYrr : I<0x5A, MRMSrcReg, (outs VR256:$dst), (ins VR128:$src),
                     "vcvtps2pd\t{$src, $dst|$dst, $src}",
                     [(set VR256:$dst,
                       (int_x86_avx_cvt_ps2_pd_256 VR128:$src))],
                     IIC_SSE_CVT_PD_RR>, TB, VEX, VEX_L, Sched<[WriteCvtF2F]>;
def VCVTPS2PDYrm : I<0x5A, MRMSrcMem, (outs VR256:$dst), (ins f128mem:$src),
                     "vcvtps2pd\t{$src, $dst|$dst, $src}",
                     [(set VR256:$dst,
                       (int_x86_avx_cvt_ps2_pd_256 (loadv4f32 addr:$src)))],
                     IIC_SSE_CVT_PD_RM>, TB, VEX, VEX_L, Sched<[WriteCvtF2FLd]>;
}

// SSE2 forms (non-VEX).
let Predicates = [UseSSE2] in {
def CVTPS2PDrr : I<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                       "cvtps2pd\t{$src, $dst|$dst, $src}",
                       [(set VR128:$dst, (int_x86_sse2_cvtps2pd VR128:$src))],
                       IIC_SSE_CVT_PD_RR>, TB, Sched<[WriteCvtF2F]>;
def CVTPS2PDrm : I<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f64mem:$src),
                   "cvtps2pd\t{$src, $dst|$dst, $src}",
                   [(set VR128:$dst, (v2f64 (extloadv2f32 addr:$src)))],
                   IIC_SSE_CVT_PD_RM>, TB, Sched<[WriteCvtF2FLd]>;
}
2146
// Convert Packed DW Integers to Packed Double FP
let Predicates = [HasAVX] in {
// The rm form carries no pattern (mayLoad only); loads are selected through
// separate patterns.
let neverHasSideEffects = 1, mayLoad = 1 in
def VCVTDQ2PDrm  : S2SI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
                     "vcvtdq2pd\t{$src, $dst|$dst, $src}",
                     []>, VEX, Sched<[WriteCvtI2FLd]>;
def VCVTDQ2PDrr  : S2SI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                     "vcvtdq2pd\t{$src, $dst|$dst, $src}",
                     [(set VR128:$dst,
                       (int_x86_sse2_cvtdq2pd VR128:$src))]>, VEX,
                   Sched<[WriteCvtI2F]>;
def VCVTDQ2PDYrm  : S2SI<0xE6, MRMSrcMem, (outs VR256:$dst), (ins i128mem:$src),
                     "vcvtdq2pd\t{$src, $dst|$dst, $src}",
                     [(set VR256:$dst,
                       (int_x86_avx_cvtdq2_pd_256
                        (bitconvert (loadv2i64 addr:$src))))]>, VEX, VEX_L,
                    Sched<[WriteCvtI2FLd]>;
def VCVTDQ2PDYrr  : S2SI<0xE6, MRMSrcReg, (outs VR256:$dst), (ins VR128:$src),
                     "vcvtdq2pd\t{$src, $dst|$dst, $src}",
                     [(set VR256:$dst,
                       (int_x86_avx_cvtdq2_pd_256 VR128:$src))]>, VEX, VEX_L,
                    Sched<[WriteCvtI2F]>;
}
2170
// SSE2 (non-VEX) cvtdq2pd.  The rm form carries no pattern (mayLoad only);
// loads are selected through separate patterns.
// Fix: the instruction itineraries were swapped — the memory form carried
// IIC_SSE_CVT_PD_RR and the register form IIC_SSE_CVT_PD_RM, inverting the
// rr/rm convention used by every other pair in this file.
let neverHasSideEffects = 1, mayLoad = 1 in
def CVTDQ2PDrm  : S2SI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
                       "cvtdq2pd\t{$src, $dst|$dst, $src}", [],
                       IIC_SSE_CVT_PD_RM>, Sched<[WriteCvtI2FLd]>;
def CVTDQ2PDrr  : S2SI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                       "cvtdq2pd\t{$src, $dst|$dst, $src}",
                       [(set VR128:$dst, (int_x86_sse2_cvtdq2pd VR128:$src))],
                       IIC_SSE_CVT_PD_RR>, Sched<[WriteCvtI2F]>;
2179
// AVX 256-bit register conversion intrinsics
// v4i32 -> v4f64 sint_to_fp maps onto the YMM widening conversion.
let Predicates = [HasAVX] in {
  def : Pat<(v4f64 (sint_to_fp (v4i32 VR128:$src))),
            (VCVTDQ2PDYrr VR128:$src)>;
  def : Pat<(v4f64 (sint_to_fp (bc_v4i32 (loadv2i64 addr:$src)))),
            (VCVTDQ2PDYrm addr:$src)>;
} // Predicates = [HasAVX]
2187
// Convert packed double to packed single
// The assembler can recognize rr 256-bit instructions by seeing a ymm
// register, but the same isn't true when using memory operands instead.
// Provide other assembly rr and rm forms to address this explicitly.
def VCVTPD2PSrr : VPDI<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                       "cvtpd2ps\t{$src, $dst|$dst, $src}",
                       [(set VR128:$dst, (int_x86_sse2_cvtpd2ps VR128:$src))],
                       IIC_SSE_CVT_PD_RR>, VEX, Sched<[WriteCvtF2F]>;

// XMM only
// The "x" suffix disambiguates the 128-bit memory form for the assembler.
def : InstAlias<"vcvtpd2psx\t{$src, $dst|$dst, $src}",
                (VCVTPD2PSrr VR128:$dst, VR128:$src)>;
def VCVTPD2PSXrm : VPDI<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
                        "cvtpd2psx\t{$src, $dst|$dst, $src}",
                        [(set VR128:$dst,
                          (int_x86_sse2_cvtpd2ps (loadv2f64 addr:$src)))],
                        IIC_SSE_CVT_PD_RM>, VEX, Sched<[WriteCvtF2FLd]>;

// YMM only
def VCVTPD2PSYrr : VPDI<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR256:$src),
                        "cvtpd2ps{y}\t{$src, $dst|$dst, $src}",
                        [(set VR128:$dst,
                          (int_x86_avx_cvt_pd2_ps_256 VR256:$src))],
                        IIC_SSE_CVT_PD_RR>, VEX, VEX_L, Sched<[WriteCvtF2F]>;
def VCVTPD2PSYrm : VPDI<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f256mem:$src),
                        "cvtpd2ps{y}\t{$src, $dst|$dst, $src}",
                        [(set VR128:$dst,
                          (int_x86_avx_cvt_pd2_ps_256 (loadv4f64 addr:$src)))],
                        IIC_SSE_CVT_PD_RM>, VEX, VEX_L, Sched<[WriteCvtF2FLd]>;
def : InstAlias<"vcvtpd2ps\t{$src, $dst|$dst, $src}",
                (VCVTPD2PSYrr VR128:$dst, VR256:$src)>;

// SSE2 forms (non-VEX).
def CVTPD2PSrr : PDI<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                     "cvtpd2ps\t{$src, $dst|$dst, $src}",
                     [(set VR128:$dst, (int_x86_sse2_cvtpd2ps VR128:$src))],
                     IIC_SSE_CVT_PD_RR>, Sched<[WriteCvtF2F]>;
def CVTPD2PSrm : PDI<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
                     "cvtpd2ps\t{$src, $dst|$dst, $src}",
                     [(set VR128:$dst,
                       (int_x86_sse2_cvtpd2ps (memopv2f64 addr:$src)))],
                     IIC_SSE_CVT_PD_RM>, Sched<[WriteCvtF2FLd]>;
2229
2230
// AVX 256-bit register conversion intrinsics
// FIXME: Migrate SSE conversion intrinsics matching to use patterns as below
// whenever possible to avoid declaring two versions of each one.
let Predicates = [HasAVX] in {
  def : Pat<(int_x86_avx_cvtdq2_ps_256 VR256:$src),
            (VCVTDQ2PSYrr VR256:$src)>;
  def : Pat<(int_x86_avx_cvtdq2_ps_256 (bitconvert (loadv4i64 addr:$src))),
            (VCVTDQ2PSYrm addr:$src)>;

  // Match fround and fextend for 128/256-bit conversions
  def : Pat<(v4f32 (X86vfpround (v2f64 VR128:$src))),
            (VCVTPD2PSrr VR128:$src)>;
  def : Pat<(v4f32 (X86vfpround (loadv2f64 addr:$src))),
            (VCVTPD2PSXrm addr:$src)>;
  def : Pat<(v4f32 (fround (v4f64 VR256:$src))),
            (VCVTPD2PSYrr VR256:$src)>;
  def : Pat<(v4f32 (fround (loadv4f64 addr:$src))),
            (VCVTPD2PSYrm addr:$src)>;

  def : Pat<(v2f64 (X86vfpext (v4f32 VR128:$src))),
            (VCVTPS2PDrr VR128:$src)>;
  def : Pat<(v4f64 (fextend (v4f32 VR128:$src))),
            (VCVTPS2PDYrr VR128:$src)>;
  def : Pat<(v4f64 (extloadv4f32 addr:$src)),
            (VCVTPS2PDYrm addr:$src)>;
}

let Predicates = [UseSSE2] in {
  // Match fround and fextend for 128 conversions
  def : Pat<(v4f32 (X86vfpround (v2f64 VR128:$src))),
            (CVTPD2PSrr VR128:$src)>;
  def : Pat<(v4f32 (X86vfpround (memopv2f64 addr:$src))),
            (CVTPD2PSrm addr:$src)>;

  def : Pat<(v2f64 (X86vfpext (v4f32 VR128:$src))),
            (CVTPS2PDrr VR128:$src)>;
}
2268
2269//===----------------------------------------------------------------------===//
2270// SSE 1 & 2 - Compare Instructions
2271//===----------------------------------------------------------------------===//
2272
// sse12_cmp_scalar - sse 1 & 2 compare scalar instructions
// Emits four instructions per instantiation: rr/rm forms that match OpNode
// with a named condition-code operand (CC), plus rr_alt/rm_alt
// assembler-only forms that accept a raw 8-bit immediate instead.
multiclass sse12_cmp_scalar<RegisterClass RC, X86MemOperand x86memop,
                            Operand CC, SDNode OpNode, ValueType VT,
                            PatFrag ld_frag, string asm, string asm_alt,
                            OpndItins itins> {
  def rr : SIi8<0xC2, MRMSrcReg,
                (outs RC:$dst), (ins RC:$src1, RC:$src2, CC:$cc), asm,
                [(set RC:$dst, (OpNode (VT RC:$src1), RC:$src2, imm:$cc))],
                itins.rr>, Sched<[itins.Sched]>;
  def rm : SIi8<0xC2, MRMSrcMem,
                (outs RC:$dst), (ins RC:$src1, x86memop:$src2, CC:$cc), asm,
                [(set RC:$dst, (OpNode (VT RC:$src1),
                                         (ld_frag addr:$src2), imm:$cc))],
                                         itins.rm>,
           Sched<[itins.Sched.Folded, ReadAfterLd]>;

  // Accept explicit immediate argument form instead of comparison code.
  let neverHasSideEffects = 1 in {
    // NOTE(review): the alt forms hard-code the F32S itineraries instead of
    // using itins.rr/itins.rm, so F64 instantiations also get F32S timing —
    // confirm whether this is intentional.
    def rr_alt : SIi8<0xC2, MRMSrcReg, (outs RC:$dst),
                      (ins RC:$src1, RC:$src2, i8imm:$cc), asm_alt, [],
                      IIC_SSE_ALU_F32S_RR>, Sched<[itins.Sched]>;
    let mayLoad = 1 in
    def rm_alt : SIi8<0xC2, MRMSrcMem, (outs RC:$dst),
                      (ins RC:$src1, x86memop:$src2, i8imm:$cc), asm_alt, [],
                      IIC_SSE_ALU_F32S_RM>,
                      Sched<[itins.Sched.Folded, ReadAfterLd]>;
  }
}
2301
// Instantiations: AVX three-operand forms (VEX_4V), then the two-operand
// SSE forms with $src1 tied to $dst.
defm VCMPSS : sse12_cmp_scalar<FR32, f32mem, AVXCC, X86cmpss, f32, loadf32,
                 "cmp${cc}ss\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                 "cmpss\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}",
                 SSE_ALU_F32S>,
                 XS, VEX_4V, VEX_LIG;
defm VCMPSD : sse12_cmp_scalar<FR64, f64mem, AVXCC, X86cmpsd, f64, loadf64,
                 "cmp${cc}sd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                 "cmpsd\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}",
                 SSE_ALU_F32S>, // same latency as 32 bit compare
                 XD, VEX_4V, VEX_LIG;

let Constraints = "$src1 = $dst" in {
  defm CMPSS : sse12_cmp_scalar<FR32, f32mem, SSECC, X86cmpss, f32, loadf32,
                  "cmp${cc}ss\t{$src2, $dst|$dst, $src2}",
                  "cmpss\t{$cc, $src2, $dst|$dst, $src2, $cc}", SSE_ALU_F32S>,
                  XS;
  defm CMPSD : sse12_cmp_scalar<FR64, f64mem, SSECC, X86cmpsd, f64, loadf64,
                  "cmp${cc}sd\t{$src2, $dst|$dst, $src2}",
                  "cmpsd\t{$cc, $src2, $dst|$dst, $src2, $cc}",
                  SSE_ALU_F64S>,
                  XD;
}
2324
// sse12_cmp_scalar_int - intrinsic (VR128) versions of the scalar compares,
// matching the cmp_ss/cmp_sd intrinsics directly.
multiclass sse12_cmp_scalar_int<X86MemOperand x86memop, Operand CC,
                         Intrinsic Int, string asm, OpndItins itins> {
  def rr : SIi8<0xC2, MRMSrcReg, (outs VR128:$dst),
                      (ins VR128:$src1, VR128:$src, CC:$cc), asm,
                        [(set VR128:$dst, (Int VR128:$src1,
                                               VR128:$src, imm:$cc))],
                                               itins.rr>,
           Sched<[itins.Sched]>;
  def rm : SIi8<0xC2, MRMSrcMem, (outs VR128:$dst),
                      (ins VR128:$src1, x86memop:$src, CC:$cc), asm,
                        [(set VR128:$dst, (Int VR128:$src1,
                                               (load addr:$src), imm:$cc))],
                                               itins.rm>,
           Sched<[itins.Sched.Folded, ReadAfterLd]>;
}

// Aliases to match intrinsics which expect XMM operand(s).
defm Int_VCMPSS  : sse12_cmp_scalar_int<f32mem, AVXCC, int_x86_sse_cmp_ss,
                     "cmp${cc}ss\t{$src, $src1, $dst|$dst, $src1, $src}",
                     SSE_ALU_F32S>,
                     XS, VEX_4V;
defm Int_VCMPSD  : sse12_cmp_scalar_int<f64mem, AVXCC, int_x86_sse2_cmp_sd,
                     "cmp${cc}sd\t{$src, $src1, $dst|$dst, $src1, $src}",
                     SSE_ALU_F32S>, // same latency as f32
                     XD, VEX_4V;
let Constraints = "$src1 = $dst" in {
  defm Int_CMPSS  : sse12_cmp_scalar_int<f32mem, SSECC, int_x86_sse_cmp_ss,
                       "cmp${cc}ss\t{$src, $dst|$dst, $src}",
                       SSE_ALU_F32S>, XS;
  defm Int_CMPSD  : sse12_cmp_scalar_int<f64mem, SSECC, int_x86_sse2_cmp_sd,
                       "cmp${cc}sd\t{$src, $dst|$dst, $src}",
                       SSE_ALU_F64S>,
                       XD;
}
2359
2360
// sse12_ord_cmp - Unordered/Ordered scalar fp compare and set EFLAGS
// These instructions have no register result ((outs) is empty); they only
// set EFLAGS (see the enclosing "let Defs = [EFLAGS]" at instantiation).
multiclass sse12_ord_cmp<bits<8> opc, RegisterClass RC, SDNode OpNode,
                            ValueType vt, X86MemOperand x86memop,
                            PatFrag ld_frag, string OpcodeStr> {
  def rr: SI<opc, MRMSrcReg, (outs), (ins RC:$src1, RC:$src2),
                     !strconcat(OpcodeStr, "\t{$src2, $src1|$src1, $src2}"),
                     [(set EFLAGS, (OpNode (vt RC:$src1), RC:$src2))],
                     IIC_SSE_COMIS_RR>,
          Sched<[WriteFAdd]>;
  def rm: SI<opc, MRMSrcMem, (outs), (ins RC:$src1, x86memop:$src2),
                     !strconcat(OpcodeStr, "\t{$src2, $src1|$src1, $src2}"),
                     [(set EFLAGS, (OpNode (vt RC:$src1),
                                           (ld_frag addr:$src2)))],
                                           IIC_SSE_COMIS_RM>,
          Sched<[WriteFAddLd, ReadAfterLd]>;
}
2377
let Defs = [EFLAGS] in {
  // Scalar FR32/FR64 forms matching the generic X86cmp node.
  defm VUCOMISS : sse12_ord_cmp<0x2E, FR32, X86cmp, f32, f32mem, loadf32,
                                  "ucomiss">, TB, VEX, VEX_LIG;
  defm VUCOMISD : sse12_ord_cmp<0x2E, FR64, X86cmp, f64, f64mem, loadf64,
                                  "ucomisd">, TB, OpSize, VEX, VEX_LIG;
  // Assembler-only comis forms: patterns cleared via "Pattern = []<dag>".
  let Pattern = []<dag> in {
    defm VCOMISS  : sse12_ord_cmp<0x2F, VR128, undef, v4f32, f128mem, load,
                                    "comiss">, TB, VEX, VEX_LIG;
    defm VCOMISD  : sse12_ord_cmp<0x2F, VR128, undef, v2f64, f128mem, load,
                                    "comisd">, TB, OpSize, VEX, VEX_LIG;
  }

  // VR128 forms matching the X86ucomi/X86comi nodes.
  defm Int_VUCOMISS  : sse12_ord_cmp<0x2E, VR128, X86ucomi, v4f32, f128mem,
                            load, "ucomiss">, TB, VEX;
  defm Int_VUCOMISD  : sse12_ord_cmp<0x2E, VR128, X86ucomi, v2f64, f128mem,
                            load, "ucomisd">, TB, OpSize, VEX;

  defm Int_VCOMISS  : sse12_ord_cmp<0x2F, VR128, X86comi, v4f32, f128mem,
                            load, "comiss">, TB, VEX;
  defm Int_VCOMISD  : sse12_ord_cmp<0x2F, VR128, X86comi, v2f64, f128mem,
                            load, "comisd">, TB, OpSize, VEX;
  // Non-VEX (SSE) counterparts of all of the above.
  defm UCOMISS  : sse12_ord_cmp<0x2E, FR32, X86cmp, f32, f32mem, loadf32,
                                  "ucomiss">, TB;
  defm UCOMISD  : sse12_ord_cmp<0x2E, FR64, X86cmp, f64, f64mem, loadf64,
                                  "ucomisd">, TB, OpSize;

  let Pattern = []<dag> in {
    defm COMISS  : sse12_ord_cmp<0x2F, VR128, undef, v4f32, f128mem, load,
                                    "comiss">, TB;
    defm COMISD  : sse12_ord_cmp<0x2F, VR128, undef, v2f64, f128mem, load,
                                    "comisd">, TB, OpSize;
  }

  defm Int_UCOMISS  : sse12_ord_cmp<0x2E, VR128, X86ucomi, v4f32, f128mem,
                              load, "ucomiss">, TB;
  defm Int_UCOMISD  : sse12_ord_cmp<0x2E, VR128, X86ucomi, v2f64, f128mem,
                              load, "ucomisd">, TB, OpSize;

  defm Int_COMISS  : sse12_ord_cmp<0x2F, VR128, X86comi, v4f32, f128mem, load,
                                  "comiss">, TB;
  defm Int_COMISD  : sse12_ord_cmp<0x2F, VR128, X86comi, v2f64, f128mem, load,
                                  "comisd">, TB, OpSize;
} // Defs = [EFLAGS]
2421
// sse12_cmp_packed - sse 1 & 2 compare packed instructions
// rri/rmi match the packed compare intrinsic with a named condition-code
// operand; rri_alt/rmi_alt are assembler-only forms taking a raw i8imm.
multiclass sse12_cmp_packed<RegisterClass RC, X86MemOperand x86memop,
                            Operand CC, Intrinsic Int, string asm,
                            string asm_alt, Domain d,
                            OpndItins itins = SSE_ALU_F32P> {
  def rri : PIi8<0xC2, MRMSrcReg,
             (outs RC:$dst), (ins RC:$src1, RC:$src2, CC:$cc), asm,
             [(set RC:$dst, (Int RC:$src1, RC:$src2, imm:$cc))],
             itins.rr, d>,
            Sched<[WriteFAdd]>;
  def rmi : PIi8<0xC2, MRMSrcMem,
             (outs RC:$dst), (ins RC:$src1, x86memop:$src2, CC:$cc), asm,
             [(set RC:$dst, (Int RC:$src1, (memop addr:$src2), imm:$cc))],
             itins.rm, d>,
            Sched<[WriteFAddLd, ReadAfterLd]>;

  // Accept explicit immediate argument form instead of comparison code.
  let neverHasSideEffects = 1 in {
    def rri_alt : PIi8<0xC2, MRMSrcReg,
               (outs RC:$dst), (ins RC:$src1, RC:$src2, i8imm:$cc),
               asm_alt, [], itins.rr, d>, Sched<[WriteFAdd]>;
    // NOTE(review): unlike sse12_cmp_scalar, rmi_alt is not marked
    // mayLoad = 1 here — confirm whether that flag was omitted on purpose.
    def rmi_alt : PIi8<0xC2, MRMSrcMem,
               (outs RC:$dst), (ins RC:$src1, x86memop:$src2, i8imm:$cc),
               asm_alt, [], itins.rm, d>,
               Sched<[WriteFAddLd, ReadAfterLd]>;
  }
}
2449
// Instantiations: AVX 128/256-bit three-operand forms (VEX_4V), then the
// two-operand SSE forms with $src1 tied to $dst.
defm VCMPPS : sse12_cmp_packed<VR128, f128mem, AVXCC, int_x86_sse_cmp_ps,
               "cmp${cc}ps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
               "cmpps\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}",
               SSEPackedSingle>, TB, VEX_4V;
defm VCMPPD : sse12_cmp_packed<VR128, f128mem, AVXCC, int_x86_sse2_cmp_pd,
               "cmp${cc}pd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
               "cmppd\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}",
               SSEPackedDouble>, TB, OpSize, VEX_4V;
defm VCMPPSY : sse12_cmp_packed<VR256, f256mem, AVXCC, int_x86_avx_cmp_ps_256,
               "cmp${cc}ps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
               "cmpps\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}",
               SSEPackedSingle>, TB, VEX_4V, VEX_L;
defm VCMPPDY : sse12_cmp_packed<VR256, f256mem, AVXCC, int_x86_avx_cmp_pd_256,
               "cmp${cc}pd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
               "cmppd\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}",
               SSEPackedDouble>, TB, OpSize, VEX_4V, VEX_L;
let Constraints = "$src1 = $dst" in {
  defm CMPPS : sse12_cmp_packed<VR128, f128mem, SSECC, int_x86_sse_cmp_ps,
                 "cmp${cc}ps\t{$src2, $dst|$dst, $src2}",
                 "cmpps\t{$cc, $src2, $dst|$dst, $src2, $cc}",
                 SSEPackedSingle, SSE_ALU_F32P>, TB;
  defm CMPPD : sse12_cmp_packed<VR128, f128mem, SSECC, int_x86_sse2_cmp_pd,
                 "cmp${cc}pd\t{$src2, $dst|$dst, $src2}",
                 "cmppd\t{$cc, $src2, $dst|$dst, $src2, $cc}",
                 SSEPackedDouble, SSE_ALU_F64P>, TB, OpSize;
}
2476
// Select the X86cmpp DAG node (packed FP compare producing an integer
// mask vector) to the CMP/VCMP instructions defined above.  The AVX
// patterns cover both 128-bit and 256-bit vectors; the SSE patterns only
// fire when AVX is unavailable (UseSSE1/UseSSE2).
let Predicates = [HasAVX] in {
def : Pat<(v4i32 (X86cmpp (v4f32 VR128:$src1), VR128:$src2, imm:$cc)),
          (VCMPPSrri (v4f32 VR128:$src1), (v4f32 VR128:$src2), imm:$cc)>;
def : Pat<(v4i32 (X86cmpp (v4f32 VR128:$src1), (memop addr:$src2), imm:$cc)),
          (VCMPPSrmi (v4f32 VR128:$src1), addr:$src2, imm:$cc)>;
def : Pat<(v2i64 (X86cmpp (v2f64 VR128:$src1), VR128:$src2, imm:$cc)),
          (VCMPPDrri VR128:$src1, VR128:$src2, imm:$cc)>;
def : Pat<(v2i64 (X86cmpp (v2f64 VR128:$src1), (memop addr:$src2), imm:$cc)),
          (VCMPPDrmi VR128:$src1, addr:$src2, imm:$cc)>;

// 256-bit compares.
def : Pat<(v8i32 (X86cmpp (v8f32 VR256:$src1), VR256:$src2, imm:$cc)),
          (VCMPPSYrri (v8f32 VR256:$src1), (v8f32 VR256:$src2), imm:$cc)>;
def : Pat<(v8i32 (X86cmpp (v8f32 VR256:$src1), (memop addr:$src2), imm:$cc)),
          (VCMPPSYrmi (v8f32 VR256:$src1), addr:$src2, imm:$cc)>;
def : Pat<(v4i64 (X86cmpp (v4f64 VR256:$src1), VR256:$src2, imm:$cc)),
          (VCMPPDYrri VR256:$src1, VR256:$src2, imm:$cc)>;
def : Pat<(v4i64 (X86cmpp (v4f64 VR256:$src1), (memop addr:$src2), imm:$cc)),
          (VCMPPDYrmi VR256:$src1, addr:$src2, imm:$cc)>;
}

// Single-precision compares only need SSE1.
let Predicates = [UseSSE1] in {
def : Pat<(v4i32 (X86cmpp (v4f32 VR128:$src1), VR128:$src2, imm:$cc)),
          (CMPPSrri (v4f32 VR128:$src1), (v4f32 VR128:$src2), imm:$cc)>;
def : Pat<(v4i32 (X86cmpp (v4f32 VR128:$src1), (memop addr:$src2), imm:$cc)),
          (CMPPSrmi (v4f32 VR128:$src1), addr:$src2, imm:$cc)>;
}

// Double-precision compares require SSE2.
let Predicates = [UseSSE2] in {
def : Pat<(v2i64 (X86cmpp (v2f64 VR128:$src1), VR128:$src2, imm:$cc)),
          (CMPPDrri VR128:$src1, VR128:$src2, imm:$cc)>;
def : Pat<(v2i64 (X86cmpp (v2f64 VR128:$src1), (memop addr:$src2), imm:$cc)),
          (CMPPDrmi VR128:$src1, addr:$src2, imm:$cc)>;
}
2510
2511//===----------------------------------------------------------------------===//
2512// SSE 1 & 2 - Shuffle Instructions
2513//===----------------------------------------------------------------------===//
2514
/// sse12_shuffle - sse 1 & 2 shuffle instructions
///
/// Instantiates SHUFPS/SHUFPD (opcode 0xC6): a two-source shuffle
/// controlled by the $src3 immediate, matched from the X86Shufp node.
multiclass sse12_shuffle<RegisterClass RC, X86MemOperand x86memop,
                         ValueType vt, string asm, PatFrag mem_frag,
                         Domain d, bit IsConvertibleToThreeAddress = 0> {
  // Register-memory form; the memory operand is the second shuffle source.
  def rmi : PIi8<0xC6, MRMSrcMem, (outs RC:$dst),
                   (ins RC:$src1, x86memop:$src2, i8imm:$src3), asm,
                   [(set RC:$dst, (vt (X86Shufp RC:$src1, (mem_frag addr:$src2),
                                       (i8 imm:$src3))))], IIC_SSE_SHUFP, d>,
            Sched<[WriteShuffleLd, ReadAfterLd]>;
  // Register-register form.  The flag lets the two-address pass rewrite a
  // tied-operand SHUFP into a three-address equivalent when profitable
  // (the SSE instantiations below pass 1 with a "cvt to pshufd" note).
  let isConvertibleToThreeAddress = IsConvertibleToThreeAddress in
    def rri : PIi8<0xC6, MRMSrcReg, (outs RC:$dst),
                   (ins RC:$src1, RC:$src2, i8imm:$src3), asm,
                   [(set RC:$dst, (vt (X86Shufp RC:$src1, RC:$src2,
                                       (i8 imm:$src3))))], IIC_SSE_SHUFP, d>,
              Sched<[WriteShuffle]>;
}
2531
// AVX shuffles: non-destructive three-operand forms, using the unaligned
// 'loadv*' fragments.  VEX_L variants operate on the full 256-bit ymm.
defm VSHUFPS  : sse12_shuffle<VR128, f128mem, v4f32,
           "shufps\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
           loadv4f32, SSEPackedSingle>, TB, VEX_4V;
defm VSHUFPSY : sse12_shuffle<VR256, f256mem, v8f32,
           "shufps\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
           loadv8f32, SSEPackedSingle>, TB, VEX_4V, VEX_L;
defm VSHUFPD  : sse12_shuffle<VR128, f128mem, v2f64,
           "shufpd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
           loadv2f64, SSEPackedDouble>, TB, OpSize, VEX_4V;
defm VSHUFPDY : sse12_shuffle<VR256, f256mem, v4f64,
           "shufpd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
           loadv4f64, SSEPackedDouble>, TB, OpSize, VEX_4V, VEX_L;

// Legacy SSE shuffles: destructive two-operand forms using the aligned
// 'memop*' fragments; convertible to three-address (pshufd) when the
// two-address constraint would otherwise force a copy.
let Constraints = "$src1 = $dst" in {
  defm SHUFPS : sse12_shuffle<VR128, f128mem, v4f32,
                    "shufps\t{$src3, $src2, $dst|$dst, $src2, $src3}",
                    memopv4f32, SSEPackedSingle, 1 /* cvt to pshufd */>,
                    TB;
  defm SHUFPD : sse12_shuffle<VR128, f128mem, v2f64,
                    "shufpd\t{$src3, $src2, $dst|$dst, $src2, $src3}",
                    memopv2f64, SSEPackedDouble, 1 /* cvt to pshufd */>,
                    TB, OpSize;
}
2555
// Map integer-typed X86Shufp nodes onto the FP shuffle instructions (the
// multiclass patterns above only cover the FP vector types).  Integer
// loads are bitcast to the matching element width where needed.
let Predicates = [HasAVX] in {
  def : Pat<(v4i32 (X86Shufp VR128:$src1,
                       (bc_v4i32 (loadv2i64 addr:$src2)), (i8 imm:$imm))),
            (VSHUFPSrmi VR128:$src1, addr:$src2, imm:$imm)>;
  def : Pat<(v4i32 (X86Shufp VR128:$src1, VR128:$src2, (i8 imm:$imm))),
            (VSHUFPSrri VR128:$src1, VR128:$src2, imm:$imm)>;

  def : Pat<(v2i64 (X86Shufp VR128:$src1,
                       (loadv2i64 addr:$src2), (i8 imm:$imm))),
            (VSHUFPDrmi VR128:$src1, addr:$src2, imm:$imm)>;
  def : Pat<(v2i64 (X86Shufp VR128:$src1, VR128:$src2, (i8 imm:$imm))),
            (VSHUFPDrri VR128:$src1, VR128:$src2, imm:$imm)>;

  // 256-bit patterns
  def : Pat<(v8i32 (X86Shufp VR256:$src1, VR256:$src2, (i8 imm:$imm))),
            (VSHUFPSYrri VR256:$src1, VR256:$src2, imm:$imm)>;
  def : Pat<(v8i32 (X86Shufp VR256:$src1,
                      (bc_v8i32 (loadv4i64 addr:$src2)), (i8 imm:$imm))),
            (VSHUFPSYrmi VR256:$src1, addr:$src2, imm:$imm)>;

  def : Pat<(v4i64 (X86Shufp VR256:$src1, VR256:$src2, (i8 imm:$imm))),
            (VSHUFPDYrri VR256:$src1, VR256:$src2, imm:$imm)>;
  def : Pat<(v4i64 (X86Shufp VR256:$src1,
                              (loadv4i64 addr:$src2), (i8 imm:$imm))),
            (VSHUFPDYrmi VR256:$src1, addr:$src2, imm:$imm)>;
}

// Legacy SSE equivalents; note the aligned 'memop' fragments here.
let Predicates = [UseSSE1] in {
  def : Pat<(v4i32 (X86Shufp VR128:$src1,
                       (bc_v4i32 (memopv2i64 addr:$src2)), (i8 imm:$imm))),
            (SHUFPSrmi VR128:$src1, addr:$src2, imm:$imm)>;
  def : Pat<(v4i32 (X86Shufp VR128:$src1, VR128:$src2, (i8 imm:$imm))),
            (SHUFPSrri VR128:$src1, VR128:$src2, imm:$imm)>;
}

let Predicates = [UseSSE2] in {
  // Generic SHUFPD patterns
  def : Pat<(v2i64 (X86Shufp VR128:$src1,
                       (memopv2i64 addr:$src2), (i8 imm:$imm))),
            (SHUFPDrmi VR128:$src1, addr:$src2, imm:$imm)>;
  def : Pat<(v2i64 (X86Shufp VR128:$src1, VR128:$src2, (i8 imm:$imm))),
            (SHUFPDrri VR128:$src1, VR128:$src2, imm:$imm)>;
}
2599
2600//===----------------------------------------------------------------------===//
2601// SSE 1 & 2 - Unpack Instructions
2602//===----------------------------------------------------------------------===//
2603
/// sse12_unpack_interleave - sse 1 & 2 unpack and interleave
///
/// Instantiates UNPCK[LH]P[SD] from the X86Unpckl/X86Unpckh nodes.  'opc'
/// is 0x14 (low) or 0x15 (high); 'mem_frag' selects the load fragment
/// (aligned memop* for SSE, unaligned loadv* for AVX).
multiclass sse12_unpack_interleave<bits<8> opc, SDNode OpNode, ValueType vt,
                                   PatFrag mem_frag, RegisterClass RC,
                                   X86MemOperand x86memop, string asm,
                                   Domain d> {
    // Register-register form.
    def rr : PI<opc, MRMSrcReg,
                (outs RC:$dst), (ins RC:$src1, RC:$src2),
                asm, [(set RC:$dst,
                           (vt (OpNode RC:$src1, RC:$src2)))],
                           IIC_SSE_UNPCK, d>, Sched<[WriteShuffle]>;
    // Register-memory form; the memory operand is the second source.
    def rm : PI<opc, MRMSrcMem,
                (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
                asm, [(set RC:$dst,
                           (vt (OpNode RC:$src1,
                                       (mem_frag addr:$src2))))],
                                       IIC_SSE_UNPCK, d>,
             Sched<[WriteShuffleLd, ReadAfterLd]>;
}
2622
// AVX unpacks: non-destructive three-operand 128-bit forms...
defm VUNPCKHPS: sse12_unpack_interleave<0x15, X86Unpckh, v4f32, loadv4f32,
      VR128, f128mem, "unpckhps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                     SSEPackedSingle>, TB, VEX_4V;
defm VUNPCKHPD: sse12_unpack_interleave<0x15, X86Unpckh, v2f64, loadv2f64,
      VR128, f128mem, "unpckhpd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                     SSEPackedDouble>, TB, OpSize, VEX_4V;
defm VUNPCKLPS: sse12_unpack_interleave<0x14, X86Unpckl, v4f32, loadv4f32,
      VR128, f128mem, "unpcklps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                     SSEPackedSingle>, TB, VEX_4V;
defm VUNPCKLPD: sse12_unpack_interleave<0x14, X86Unpckl, v2f64, loadv2f64,
      VR128, f128mem, "unpcklpd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                     SSEPackedDouble>, TB, OpSize, VEX_4V;

// ...and the 256-bit (VEX_L) forms.
defm VUNPCKHPSY: sse12_unpack_interleave<0x15, X86Unpckh, v8f32, loadv8f32,
      VR256, f256mem, "unpckhps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                     SSEPackedSingle>, TB, VEX_4V, VEX_L;
defm VUNPCKHPDY: sse12_unpack_interleave<0x15, X86Unpckh, v4f64, loadv4f64,
      VR256, f256mem, "unpckhpd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                     SSEPackedDouble>, TB, OpSize, VEX_4V, VEX_L;
defm VUNPCKLPSY: sse12_unpack_interleave<0x14, X86Unpckl, v8f32, loadv8f32,
      VR256, f256mem, "unpcklps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                     SSEPackedSingle>, TB, VEX_4V, VEX_L;
defm VUNPCKLPDY: sse12_unpack_interleave<0x14, X86Unpckl, v4f64, loadv4f64,
      VR256, f256mem, "unpcklpd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                     SSEPackedDouble>, TB, OpSize, VEX_4V, VEX_L;

// Legacy SSE unpacks: destructive two-operand forms with aligned loads.
let Constraints = "$src1 = $dst" in {
  defm UNPCKHPS: sse12_unpack_interleave<0x15, X86Unpckh, v4f32, memopv4f32,
        VR128, f128mem, "unpckhps\t{$src2, $dst|$dst, $src2}",
                       SSEPackedSingle>, TB;
  defm UNPCKHPD: sse12_unpack_interleave<0x15, X86Unpckh, v2f64, memopv2f64,
        VR128, f128mem, "unpckhpd\t{$src2, $dst|$dst, $src2}",
                       SSEPackedDouble>, TB, OpSize;
  defm UNPCKLPS: sse12_unpack_interleave<0x14, X86Unpckl, v4f32, memopv4f32,
        VR128, f128mem, "unpcklps\t{$src2, $dst|$dst, $src2}",
                       SSEPackedSingle>, TB;
  defm UNPCKLPD: sse12_unpack_interleave<0x14, X86Unpckl, v2f64, memopv2f64,
        VR128, f128mem, "unpcklpd\t{$src2, $dst|$dst, $src2}",
                       SSEPackedDouble>, TB, OpSize;
} // Constraints = "$src1 = $dst"
2663
// With AVX1 only (no AVX2), 256-bit integer unpacks have no native
// instruction, so select them to the FP unpack forms instead.
let Predicates = [HasAVX1Only] in {
  def : Pat<(v8i32 (X86Unpckl VR256:$src1, (bc_v8i32 (loadv4i64 addr:$src2)))),
            (VUNPCKLPSYrm VR256:$src1, addr:$src2)>;
  def : Pat<(v8i32 (X86Unpckl VR256:$src1, VR256:$src2)),
            (VUNPCKLPSYrr VR256:$src1, VR256:$src2)>;
  def : Pat<(v8i32 (X86Unpckh VR256:$src1, (bc_v8i32 (loadv4i64 addr:$src2)))),
            (VUNPCKHPSYrm VR256:$src1, addr:$src2)>;
  def : Pat<(v8i32 (X86Unpckh VR256:$src1, VR256:$src2)),
            (VUNPCKHPSYrr VR256:$src1, VR256:$src2)>;

  def : Pat<(v4i64 (X86Unpckl VR256:$src1, (loadv4i64 addr:$src2))),
            (VUNPCKLPDYrm VR256:$src1, addr:$src2)>;
  def : Pat<(v4i64 (X86Unpckl VR256:$src1, VR256:$src2)),
            (VUNPCKLPDYrr VR256:$src1, VR256:$src2)>;
  def : Pat<(v4i64 (X86Unpckh VR256:$src1, (loadv4i64 addr:$src2))),
            (VUNPCKHPDYrm VR256:$src1, addr:$src2)>;
  def : Pat<(v4i64 (X86Unpckh VR256:$src1, VR256:$src2)),
            (VUNPCKHPDYrr VR256:$src1, VR256:$src2)>;
}

// Register-only MOVDDUP is selected as an unpack-low of a value with
// itself (same semantics for the register form).
let Predicates = [HasAVX] in {
  // FIXME: Instead of X86Movddup, there should be a X86Unpckl here, the
  // problem is during lowering, where it's not possible to recognize the load
  // fold cause it has two uses through a bitcast. One use disappears at isel
  // time and the fold opportunity reappears.
  def : Pat<(v2f64 (X86Movddup VR128:$src)),
            (VUNPCKLPDrr VR128:$src, VR128:$src)>;
}

let Predicates = [UseSSE2] in {
  // FIXME: Instead of X86Movddup, there should be a X86Unpckl here, the
  // problem is during lowering, where it's not possible to recognize the load
  // fold cause it has two uses through a bitcast. One use disappears at isel
  // time and the fold opportunity reappears.
  def : Pat<(v2f64 (X86Movddup VR128:$src)),
            (UNPCKLPDrr VR128:$src, VR128:$src)>;
}
2701
2702//===----------------------------------------------------------------------===//
2703// SSE 1 & 2 - Extract Floating-Point Sign mask
2704//===----------------------------------------------------------------------===//
2705
/// sse12_extr_sign_mask - sse 1 & 2 extract floating-point sign mask
/// (MOVMSKPS/MOVMSKPD, opcode 0x50).  (The previous header comment,
/// "unpack and interleave", was copy-pasted from the section above.)
multiclass sse12_extr_sign_mask<RegisterClass RC, Intrinsic Int, string asm,
                                Domain d> {
  // Register-only form: gathers the sign bit of each packed FP element of
  // $src into the low bits of a 32/64-bit GPR, per the intrinsic.
  def rr : PI<0x50, MRMSrcReg, (outs GR32orGR64:$dst), (ins RC:$src),
              !strconcat(asm, "\t{$src, $dst|$dst, $src}"),
              [(set GR32orGR64:$dst, (Int RC:$src))], IIC_SSE_MOVMSK, d>,
              Sched<[WriteVecLogic]>;
}
2714
// VEX-encoded MOVMSK forms, plus patterns selecting scalar X86fgetsign to
// a movmsk of the whole vector register (the scalar lives in element 0;
// bit 0 of the result is its sign).  The i64 versions zero-extend via
// SUBREG_TO_REG since the instruction writes a 32-bit result.
let Predicates = [HasAVX] in {
  defm VMOVMSKPS : sse12_extr_sign_mask<VR128, int_x86_sse_movmsk_ps,
                                        "movmskps", SSEPackedSingle>, TB, VEX;
  defm VMOVMSKPD : sse12_extr_sign_mask<VR128, int_x86_sse2_movmsk_pd,
                                        "movmskpd", SSEPackedDouble>, TB,
                                        OpSize, VEX;
  defm VMOVMSKPSY : sse12_extr_sign_mask<VR256, int_x86_avx_movmsk_ps_256,
                                        "movmskps", SSEPackedSingle>, TB,
                                        VEX, VEX_L;
  defm VMOVMSKPDY : sse12_extr_sign_mask<VR256, int_x86_avx_movmsk_pd_256,
                                        "movmskpd", SSEPackedDouble>, TB,
                                        OpSize, VEX, VEX_L;

  def : Pat<(i32 (X86fgetsign FR32:$src)),
            (VMOVMSKPSrr (COPY_TO_REGCLASS FR32:$src, VR128))>;
  def : Pat<(i64 (X86fgetsign FR32:$src)),
            (SUBREG_TO_REG (i64 0),
             (VMOVMSKPSrr (COPY_TO_REGCLASS FR32:$src, VR128)), sub_32bit)>;
  def : Pat<(i32 (X86fgetsign FR64:$src)),
            (VMOVMSKPDrr (COPY_TO_REGCLASS FR64:$src, VR128))>;
  def : Pat<(i64 (X86fgetsign FR64:$src)),
            (SUBREG_TO_REG (i64 0),
             (VMOVMSKPDrr (COPY_TO_REGCLASS FR64:$src, VR128)), sub_32bit)>;
}

// Legacy SSE encodings and the corresponding fgetsign patterns.
defm MOVMSKPS : sse12_extr_sign_mask<VR128, int_x86_sse_movmsk_ps, "movmskps",
                                     SSEPackedSingle>, TB;
defm MOVMSKPD : sse12_extr_sign_mask<VR128, int_x86_sse2_movmsk_pd, "movmskpd",
                                     SSEPackedDouble>, TB, OpSize;

def : Pat<(i32 (X86fgetsign FR32:$src)),
          (MOVMSKPSrr (COPY_TO_REGCLASS FR32:$src, VR128))>,
      Requires<[UseSSE1]>;
def : Pat<(i64 (X86fgetsign FR32:$src)),
          (SUBREG_TO_REG (i64 0),
           (MOVMSKPSrr (COPY_TO_REGCLASS FR32:$src, VR128)), sub_32bit)>,
      Requires<[UseSSE1]>;
def : Pat<(i32 (X86fgetsign FR64:$src)),
          (MOVMSKPDrr (COPY_TO_REGCLASS FR64:$src, VR128))>,
      Requires<[UseSSE2]>;
def : Pat<(i64 (X86fgetsign FR64:$src)),
          (SUBREG_TO_REG (i64 0),
           (MOVMSKPDrr (COPY_TO_REGCLASS FR64:$src, VR128)), sub_32bit)>,
      Requires<[UseSSE2]>;
2759
2760//===---------------------------------------------------------------------===//
2761// SSE2 - Packed Integer Logical Instructions
2762//===---------------------------------------------------------------------===//
2763
let ExeDomain = SSEPackedInt in { // SSE integer instructions

/// PDI_binop_rm - Simple SSE2 binary operator.
///
/// Emits the rr and rm forms of one packed-integer binop.  'Is2Addr'
/// selects the destructive two-operand asm string (legacy SSE) versus the
/// three-operand one (VEX); memory operands are bitcast to the generic
/// integer vector type before applying OpNode.
multiclass PDI_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
                        ValueType OpVT, RegisterClass RC, PatFrag memop_frag,
                        X86MemOperand x86memop, OpndItins itins,
                        bit IsCommutable, bit Is2Addr> {
  let isCommutable = IsCommutable in
  def rr : PDI<opc, MRMSrcReg, (outs RC:$dst),
       (ins RC:$src1, RC:$src2),
       !if(Is2Addr,
           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
       [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2)))], itins.rr>,
       Sched<[itins.Sched]>;
  def rm : PDI<opc, MRMSrcMem, (outs RC:$dst),
       (ins RC:$src1, x86memop:$src2),
       !if(Is2Addr,
           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
       [(set RC:$dst, (OpVT (OpNode RC:$src1,
                                     (bitconvert (memop_frag addr:$src2)))))],
                                     itins.rm>,
       Sched<[itins.Sched.Folded, ReadAfterLd]>;
}
} // ExeDomain = SSEPackedInt
2790
/// PDI_binop_all - Instantiates one packed-integer binop in all three
/// encodings: VEX 128-bit (V prefix, HasAVX), legacy SSE 128-bit
/// (two-address), and VEX 256-bit (V...Y, HasAVX2).
multiclass PDI_binop_all<bits<8> opc, string OpcodeStr, SDNode Opcode,
                         ValueType OpVT128, ValueType OpVT256,
                         OpndItins itins, bit IsCommutable = 0> {
let Predicates = [HasAVX] in
  defm V#NAME : PDI_binop_rm<opc, !strconcat("v", OpcodeStr), Opcode, OpVT128,
                    VR128, loadv2i64, i128mem, itins, IsCommutable, 0>, VEX_4V;

let Constraints = "$src1 = $dst" in
  defm NAME : PDI_binop_rm<opc, OpcodeStr, Opcode, OpVT128, VR128,
                           memopv2i64, i128mem, itins, IsCommutable, 1>;

let Predicates = [HasAVX2] in
  defm V#NAME#Y : PDI_binop_rm<opc, !strconcat("v", OpcodeStr), Opcode,
                               OpVT256, VR256, loadv4i64, i256mem, itins,
                               IsCommutable, 0>, VEX_4V, VEX_L;
}
2807
// These are ordered here for pattern ordering requirements with the fp versions

// Packed integer bitwise logic on the v2i64/v4i64 types.  PANDN computes
// (NOT $src1) AND $src2 (X86andnp) and so is not commutable.
defm PAND  : PDI_binop_all<0xDB, "pand", and, v2i64, v4i64, SSE_BIT_ITINS_P, 1>;
defm POR   : PDI_binop_all<0xEB, "por", or, v2i64, v4i64, SSE_BIT_ITINS_P, 1>;
defm PXOR  : PDI_binop_all<0xEF, "pxor", xor, v2i64, v4i64, SSE_BIT_ITINS_P, 1>;
defm PANDN : PDI_binop_all<0xDF, "pandn", X86andnp, v2i64, v4i64,
                           SSE_BIT_ITINS_P, 0>;
2815
2816//===----------------------------------------------------------------------===//
2817// SSE 1 & 2 - Logical Instructions
2818//===----------------------------------------------------------------------===//
2819
/// sse12_fp_alias_pack_logical - SSE 1 & 2 aliased packed FP logical ops
///
/// Instantiates the packed logical opcodes on the *scalar* register
/// classes (FR32/FR64) so scalar FP bit operations (X86fand etc.) can be
/// selected; the instantiations below are marked isCodeGenOnly.
multiclass sse12_fp_alias_pack_logical<bits<8> opc, string OpcodeStr,
                                       SDNode OpNode, OpndItins itins> {
  // VEX three-operand forms.
  defm V#NAME#PS : sse12_fp_packed<opc, !strconcat(OpcodeStr, "ps"), OpNode,
              FR32, f32, f128mem, memopfsf32, SSEPackedSingle, itins, 0>,
              TB, VEX_4V;

  defm V#NAME#PD : sse12_fp_packed<opc, !strconcat(OpcodeStr, "pd"), OpNode,
        FR64, f64, f128mem, memopfsf64, SSEPackedDouble, itins, 0>,
        TB, OpSize, VEX_4V;

  // Legacy SSE destructive two-operand forms.
  let Constraints = "$src1 = $dst" in {
    defm PS : sse12_fp_packed<opc, !strconcat(OpcodeStr, "ps"), OpNode, FR32,
                f32, f128mem, memopfsf32, SSEPackedSingle, itins>,
                TB;

    defm PD : sse12_fp_packed<opc, !strconcat(OpcodeStr, "pd"), OpNode, FR64,
                f64, f128mem, memopfsf64, SSEPackedDouble, itins>,
                TB, OpSize;
  }
}
2842
// Alias bitwise logical operations using SSE logical ops on packed FP values.
// isCodeGenOnly: these share encodings with the real ANDPS/ORPS/... defs
// below, so they must not be fed to the assembler/disassembler tables.
let isCodeGenOnly = 1 in {
  defm FsAND  : sse12_fp_alias_pack_logical<0x54, "and", X86fand,
                SSE_BIT_ITINS_P>;
  defm FsOR   : sse12_fp_alias_pack_logical<0x56, "or", X86for,
                SSE_BIT_ITINS_P>;
  defm FsXOR  : sse12_fp_alias_pack_logical<0x57, "xor", X86fxor,
                SSE_BIT_ITINS_P>;

  // ANDN is not commutable: it computes (NOT src1) AND src2.
  let isCommutable = 0 in
    defm FsANDN : sse12_fp_alias_pack_logical<0x55, "andn", X86fandn,
                  SSE_BIT_ITINS_P>;
}
2856
/// sse12_fp_packed_logical - SSE 1 & 2 packed FP logical ops
///
/// Instantiates ANDPS/ANDPD-style packed logical ops on full vector
/// registers.  All patterns operate on the integer vector types
/// (v2i64/v4i64) with FP sources bitcast in, since the DAG canonicalizes
/// vector logic to integer types.
multiclass sse12_fp_packed_logical<bits<8> opc, string OpcodeStr,
                                   SDNode OpNode> {
  // 256-bit VEX forms.
  defm V#NAME#PSY : sse12_fp_packed_logical_rm<opc, VR256, SSEPackedSingle,
        !strconcat(OpcodeStr, "ps"), f256mem,
        [(set VR256:$dst, (v4i64 (OpNode VR256:$src1, VR256:$src2)))],
        [(set VR256:$dst, (OpNode (bc_v4i64 (v8f32 VR256:$src1)),
                           (loadv4i64 addr:$src2)))], 0>, TB, VEX_4V, VEX_L;

  defm V#NAME#PDY : sse12_fp_packed_logical_rm<opc, VR256, SSEPackedDouble,
        !strconcat(OpcodeStr, "pd"), f256mem,
        [(set VR256:$dst, (OpNode (bc_v4i64 (v4f64 VR256:$src1)),
                                  (bc_v4i64 (v4f64 VR256:$src2))))],
        [(set VR256:$dst, (OpNode (bc_v4i64 (v4f64 VR256:$src1)),
                                  (loadv4i64 addr:$src2)))], 0>,
                                  TB, OpSize, VEX_4V, VEX_L;

  // In AVX no need to add a pattern for 128-bit logical rr ps, because they
  // are all promoted to v2i64, and the patterns are covered by the int
  // version. This is needed in SSE only, because v2i64 isn't supported on
  // SSE1, but only on SSE2.
  defm V#NAME#PS : sse12_fp_packed_logical_rm<opc, VR128, SSEPackedSingle,
       !strconcat(OpcodeStr, "ps"), f128mem, [],
       [(set VR128:$dst, (OpNode (bc_v2i64 (v4f32 VR128:$src1)),
                                 (loadv2i64 addr:$src2)))], 0>, TB, VEX_4V;

  defm V#NAME#PD : sse12_fp_packed_logical_rm<opc, VR128, SSEPackedDouble,
       !strconcat(OpcodeStr, "pd"), f128mem,
       [(set VR128:$dst, (OpNode (bc_v2i64 (v2f64 VR128:$src1)),
                                 (bc_v2i64 (v2f64 VR128:$src2))))],
       [(set VR128:$dst, (OpNode (bc_v2i64 (v2f64 VR128:$src1)),
                                 (loadv2i64 addr:$src2)))], 0>,
                                                 TB, OpSize, VEX_4V;

  // Legacy SSE destructive two-operand forms (aligned memop fragments).
  let Constraints = "$src1 = $dst" in {
    defm PS : sse12_fp_packed_logical_rm<opc, VR128, SSEPackedSingle,
         !strconcat(OpcodeStr, "ps"), f128mem,
         [(set VR128:$dst, (v2i64 (OpNode VR128:$src1, VR128:$src2)))],
         [(set VR128:$dst, (OpNode (bc_v2i64 (v4f32 VR128:$src1)),
                                   (memopv2i64 addr:$src2)))]>, TB;

    defm PD : sse12_fp_packed_logical_rm<opc, VR128, SSEPackedDouble,
         !strconcat(OpcodeStr, "pd"), f128mem,
         [(set VR128:$dst, (OpNode (bc_v2i64 (v2f64 VR128:$src1)),
                                   (bc_v2i64 (v2f64 VR128:$src2))))],
         [(set VR128:$dst, (OpNode (bc_v2i64 (v2f64 VR128:$src1)),
                                   (memopv2i64 addr:$src2)))]>, TB, OpSize;
  }
}
2907
// Packed FP bitwise logic (ANDPS/ANDPD, ORPS/..., XORPS/..., ANDNPS/...).
// ANDN computes (NOT src1) AND src2, hence isCommutable = 0.
defm AND  : sse12_fp_packed_logical<0x54, "and", and>;
defm OR   : sse12_fp_packed_logical<0x56, "or", or>;
defm XOR  : sse12_fp_packed_logical<0x57, "xor", xor>;
let isCommutable = 0 in
  defm ANDN : sse12_fp_packed_logical<0x55, "andn", X86andnp>;
2913
2914//===----------------------------------------------------------------------===//
2915// SSE 1 & 2 - Arithmetic Instructions
2916//===----------------------------------------------------------------------===//
2917
2918/// basic_sse12_fp_binop_xxx - SSE 1 & 2 binops come in both scalar and
2919/// vector forms.
2920///
2921/// In addition, we also have a special variant of the scalar form here to
2922/// represent the associated intrinsic operation.  This form is unlike the
2923/// plain scalar form, in that it takes an entire vector (instead of a scalar)
2924/// and leaves the top elements unmodified (therefore these cannot be commuted).
2925///
2926/// These three forms can each be reg+reg or reg+mem.
2927///
2928
/// FIXME: once all 256-bit intrinsics are matched, cleanup and refactor those
/// classes below

/// basic_sse12_fp_binop_p - packed forms of one FP binop: VEX 128/256-bit
/// (non-destructive) plus legacy SSE 128-bit (two-address), for both
/// single and double precision.  'itins' supplies per-size itineraries.
multiclass basic_sse12_fp_binop_p<bits<8> opc, string OpcodeStr,
                                  SDNode OpNode, SizeItins itins> {
  defm V#NAME#PS : sse12_fp_packed<opc, !strconcat(OpcodeStr, "ps"), OpNode,
                               VR128, v4f32, f128mem, loadv4f32,
                               SSEPackedSingle, itins.s, 0>, TB, VEX_4V;
  defm V#NAME#PD : sse12_fp_packed<opc, !strconcat(OpcodeStr, "pd"), OpNode,
                               VR128, v2f64, f128mem, loadv2f64,
                               SSEPackedDouble, itins.d, 0>, TB, OpSize, VEX_4V;

  defm V#NAME#PSY : sse12_fp_packed<opc, !strconcat(OpcodeStr, "ps"),
                        OpNode, VR256, v8f32, f256mem, loadv8f32,
                        SSEPackedSingle, itins.s, 0>, TB, VEX_4V, VEX_L;
  defm V#NAME#PDY : sse12_fp_packed<opc, !strconcat(OpcodeStr, "pd"),
                        OpNode, VR256, v4f64, f256mem, loadv4f64,
                        SSEPackedDouble, itins.d, 0>, TB, OpSize, VEX_4V, VEX_L;

  // Legacy SSE forms with aligned memory fragments.
  let Constraints = "$src1 = $dst" in {
    defm PS : sse12_fp_packed<opc, !strconcat(OpcodeStr, "ps"), OpNode, VR128,
                              v4f32, f128mem, memopv4f32, SSEPackedSingle,
                              itins.s>, TB;
    defm PD : sse12_fp_packed<opc, !strconcat(OpcodeStr, "pd"), OpNode, VR128,
                              v2f64, f128mem, memopv2f64, SSEPackedDouble,
                              itins.d>, TB, OpSize;
  }
}
2956
/// basic_sse12_fp_binop_s - plain scalar forms (FR32/FR64) of one FP
/// binop: VEX three-operand (VEX_LIG: 128/256 encodings equivalent) plus
/// legacy SSE two-address.
multiclass basic_sse12_fp_binop_s<bits<8> opc, string OpcodeStr, SDNode OpNode,
                                  SizeItins itins> {
  defm V#NAME#SS : sse12_fp_scalar<opc, !strconcat(OpcodeStr, "ss"),
                         OpNode, FR32, f32mem, itins.s, 0>, XS, VEX_4V, VEX_LIG;
  defm V#NAME#SD : sse12_fp_scalar<opc, !strconcat(OpcodeStr, "sd"),
                         OpNode, FR64, f64mem, itins.d, 0>, XD, VEX_4V, VEX_LIG;

  let Constraints = "$src1 = $dst" in {
    defm SS : sse12_fp_scalar<opc, !strconcat(OpcodeStr, "ss"),
                              OpNode, FR32, f32mem, itins.s>, XS;
    defm SD : sse12_fp_scalar<opc, !strconcat(OpcodeStr, "sd"),
                              OpNode, FR64, f64mem, itins.d>, XD;
  }
}
2971
/// basic_sse12_fp_binop_s_int - intrinsic scalar forms: operate on a whole
/// VR128 and leave the upper elements unmodified (hence not commutable).
/// The "" / "2" strings pick the sse vs sse2 intrinsic namespace.
multiclass basic_sse12_fp_binop_s_int<bits<8> opc, string OpcodeStr,
                                      SizeItins itins> {
  defm V#NAME#SS : sse12_fp_scalar_int<opc, OpcodeStr, VR128,
                   !strconcat(OpcodeStr, "ss"), "", "_ss", ssmem, sse_load_f32,
                   itins.s, 0>, XS, VEX_4V, VEX_LIG;
  defm V#NAME#SD : sse12_fp_scalar_int<opc, OpcodeStr, VR128,
                   !strconcat(OpcodeStr, "sd"), "2", "_sd", sdmem, sse_load_f64,
                   itins.d, 0>, XD, VEX_4V, VEX_LIG;

  let Constraints = "$src1 = $dst" in {
    defm SS : sse12_fp_scalar_int<opc, OpcodeStr, VR128,
                   !strconcat(OpcodeStr, "ss"), "", "_ss", ssmem, sse_load_f32,
                   itins.s>, XS;
    defm SD : sse12_fp_scalar_int<opc, OpcodeStr, VR128,
                   !strconcat(OpcodeStr, "sd"), "2", "_sd", sdmem, sse_load_f64,
                   itins.d>, XD;
  }
}
2990
// Binary Arithmetic instructions
// Each mnemonic gets all three shapes: packed, plain scalar, and
// intrinsic scalar.  SUB/DIV are inherently non-commutable; MAX/MIN are
// also non-commutable because SSE max/min are not symmetric in the
// presence of NaNs and signed zero (the second operand is returned on
// ties/NaN).
defm ADD : basic_sse12_fp_binop_p<0x58, "add", fadd, SSE_ALU_ITINS_P>,
           basic_sse12_fp_binop_s<0x58, "add", fadd, SSE_ALU_ITINS_S>,
           basic_sse12_fp_binop_s_int<0x58, "add", SSE_ALU_ITINS_S>;
defm MUL : basic_sse12_fp_binop_p<0x59, "mul", fmul, SSE_MUL_ITINS_P>,
           basic_sse12_fp_binop_s<0x59, "mul", fmul, SSE_MUL_ITINS_S>,
           basic_sse12_fp_binop_s_int<0x59, "mul", SSE_MUL_ITINS_S>;
let isCommutable = 0 in {
  defm SUB : basic_sse12_fp_binop_p<0x5C, "sub", fsub, SSE_ALU_ITINS_P>,
             basic_sse12_fp_binop_s<0x5C, "sub", fsub, SSE_ALU_ITINS_S>,
             basic_sse12_fp_binop_s_int<0x5C, "sub", SSE_ALU_ITINS_S>;
  defm DIV : basic_sse12_fp_binop_p<0x5E, "div", fdiv, SSE_DIV_ITINS_P>,
             basic_sse12_fp_binop_s<0x5E, "div", fdiv, SSE_DIV_ITINS_S>,
             basic_sse12_fp_binop_s_int<0x5E, "div", SSE_DIV_ITINS_S>;
  defm MAX : basic_sse12_fp_binop_p<0x5F, "max", X86fmax, SSE_ALU_ITINS_P>,
             basic_sse12_fp_binop_s<0x5F, "max", X86fmax, SSE_ALU_ITINS_S>,
             basic_sse12_fp_binop_s_int<0x5F, "max", SSE_ALU_ITINS_S>;
  defm MIN : basic_sse12_fp_binop_p<0x5D, "min", X86fmin, SSE_ALU_ITINS_P>,
             basic_sse12_fp_binop_s<0x5D, "min", X86fmin, SSE_ALU_ITINS_S>,
             basic_sse12_fp_binop_s_int<0x5D, "min", SSE_ALU_ITINS_S>;
}

// Commutative variants of MAX/MIN (X86fmaxc/X86fminc), used when fast-math
// style relaxations make operand order irrelevant.  isCodeGenOnly: they
// share encodings with MAX/MIN above.
let isCodeGenOnly = 1 in {
  defm MAXC: basic_sse12_fp_binop_p<0x5F, "max", X86fmaxc, SSE_ALU_ITINS_P>,
             basic_sse12_fp_binop_s<0x5F, "max", X86fmaxc, SSE_ALU_ITINS_S>;
  defm MINC: basic_sse12_fp_binop_p<0x5D, "min", X86fminc, SSE_ALU_ITINS_P>,
             basic_sse12_fp_binop_s<0x5D, "min", X86fminc, SSE_ALU_ITINS_S>;
}
3019
3020/// Unop Arithmetic
3021/// In addition, we also have a special variant of the scalar form here to
3022/// represent the associated intrinsic operation.  This form is unlike the
3023/// plain scalar form, in that it takes an entire vector (instead of a
3024/// scalar) and leaves the top elements undefined.
3025///
3026/// And, we have a special variant form for a full-vector intrinsic form.
3027
// Itinerary bundles for the unary ops below: square root under the
// WriteFSqrt scheduling class, reciprocal (approximation) under WriteFRcp.
let Sched = WriteFSqrt in {
def SSE_SQRTPS : OpndItins<
  IIC_SSE_SQRTPS_RR, IIC_SSE_SQRTPS_RM
>;

def SSE_SQRTSS : OpndItins<
  IIC_SSE_SQRTSS_RR, IIC_SSE_SQRTSS_RM
>;

def SSE_SQRTPD : OpndItins<
  IIC_SSE_SQRTPD_RR, IIC_SSE_SQRTPD_RM
>;

def SSE_SQRTSD : OpndItins<
  IIC_SSE_SQRTSD_RR, IIC_SSE_SQRTSD_RM
>;
}

let Sched = WriteFRcp in {
def SSE_RCPP : OpndItins<
  IIC_SSE_RCPP_RR, IIC_SSE_RCPP_RM
>;

def SSE_RCPS : OpndItins<
  IIC_SSE_RCPS_RR, IIC_SSE_RCPS_RM
>;
}
3055
/// sse1_fp_unop_s - SSE1 unops in scalar form.
///
/// Instantiates both the AVX (V-prefixed, VEX-encoded, three-operand) and
/// the legacy SSE forms of a scalar single-precision unary op:
///   * V#NAME#SSr/SSm/SSm_Int: declared with empty patterns
///     (hasSideEffects = 0); they are selected via explicit Pat<>s later
///     in this file.
///   * SSr/SSm: legacy two-operand forms carrying the OpNode patterns.
///   * SSr_Int/SSm_Int: legacy forms matching the F32Int intrinsic, which
///     takes a whole VR128 and leaves the upper elements undefined (see the
///     "Unop Arithmetic" note above).
multiclass sse1_fp_unop_s<bits<8> opc, string OpcodeStr,
                          SDNode OpNode, Intrinsic F32Int, OpndItins itins> {
let Predicates = [HasAVX], hasSideEffects = 0 in {
  def V#NAME#SSr : SSI<opc, MRMSrcReg, (outs FR32:$dst),
                      (ins FR32:$src1, FR32:$src2),
                      !strconcat("v", OpcodeStr,
                                 "ss\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                      []>, VEX_4V, VEX_LIG, Sched<[itins.Sched]>;
  let mayLoad = 1 in {
  def V#NAME#SSm : SSI<opc, MRMSrcMem, (outs FR32:$dst),
                      (ins FR32:$src1,f32mem:$src2),
                      !strconcat("v", OpcodeStr,
                                 "ss\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                      []>, VEX_4V, VEX_LIG,
                   Sched<[itins.Sched.Folded, ReadAfterLd]>;
  def V#NAME#SSm_Int : SSI<opc, MRMSrcMem, (outs VR128:$dst),
                      (ins VR128:$src1, ssmem:$src2),
                      !strconcat("v", OpcodeStr,
                                 "ss\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                      []>, VEX_4V, VEX_LIG,
                      Sched<[itins.Sched.Folded, ReadAfterLd]>;
  }
}

  def SSr : SSI<opc, MRMSrcReg, (outs FR32:$dst), (ins FR32:$src),
                !strconcat(OpcodeStr, "ss\t{$src, $dst|$dst, $src}"),
                [(set FR32:$dst, (OpNode FR32:$src))]>, Sched<[itins.Sched]>;
  // For scalar unary operations, fold a load into the operation
  // only in OptForSize mode. It eliminates an instruction, but it also
  // eliminates a whole-register clobber (the load), so it introduces a
  // partial register update condition.
  def SSm : I<opc, MRMSrcMem, (outs FR32:$dst), (ins f32mem:$src),
                !strconcat(OpcodeStr, "ss\t{$src, $dst|$dst, $src}"),
                [(set FR32:$dst, (OpNode (load addr:$src)))], itins.rm>, XS,
            Requires<[UseSSE1, OptForSize]>, Sched<[itins.Sched.Folded]>;
  def SSr_Int : SSI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                    !strconcat(OpcodeStr, "ss\t{$src, $dst|$dst, $src}"),
                    [(set VR128:$dst, (F32Int VR128:$src))], itins.rr>,
                Sched<[itins.Sched]>;
  def SSm_Int : SSI<opc, MRMSrcMem, (outs VR128:$dst), (ins ssmem:$src),
                    !strconcat(OpcodeStr, "ss\t{$src, $dst|$dst, $src}"),
                    [(set VR128:$dst, (F32Int sse_load_f32:$src))], itins.rm>,
                Sched<[itins.Sched.Folded]>;
}
3101
/// sse1_fp_unop_rw - SSE1 unops where the vector form has a read-write operand.
/// Like sse1_fp_unop_s, but the legacy intrinsic forms (SSr_Int/SSm_Int)
/// tie $src1 to $dst ("$src1 = $dst"), i.e. the vector operand is
/// read-write; those forms carry no patterns and are selected via the
/// explicit Pat<>s under "Reciprocal approximations" below.
multiclass sse1_fp_unop_rw<bits<8> opc, string OpcodeStr, SDNode OpNode,
                           OpndItins itins> {
let Predicates = [HasAVX], hasSideEffects = 0 in {
  // AVX forms: no patterns here; matched by explicit Pat<>s elsewhere.
  def V#NAME#SSr : SSI<opc, MRMSrcReg, (outs FR32:$dst),
                       (ins FR32:$src1, FR32:$src2),
                       !strconcat("v", OpcodeStr,
                           "ss\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                []>, VEX_4V, VEX_LIG, Sched<[itins.Sched]>;
  let mayLoad = 1 in {
  def V#NAME#SSm : SSI<opc, MRMSrcMem, (outs FR32:$dst),
                      (ins FR32:$src1,f32mem:$src2),
                      !strconcat("v", OpcodeStr,
                                 "ss\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                      []>, VEX_4V, VEX_LIG,
                   Sched<[itins.Sched.Folded, ReadAfterLd]>;
  def V#NAME#SSm_Int : SSI<opc, MRMSrcMem, (outs VR128:$dst),
                      (ins VR128:$src1, ssmem:$src2),
                      !strconcat("v", OpcodeStr,
                                 "ss\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                      []>, VEX_4V, VEX_LIG,
                      Sched<[itins.Sched.Folded, ReadAfterLd]>;
  }
}

  def SSr : SSI<opc, MRMSrcReg, (outs FR32:$dst), (ins FR32:$src),
                !strconcat(OpcodeStr, "ss\t{$src, $dst|$dst, $src}"),
                [(set FR32:$dst, (OpNode FR32:$src))]>, Sched<[itins.Sched]>;
  // For scalar unary operations, fold a load into the operation
  // only in OptForSize mode. It eliminates an instruction, but it also
  // eliminates a whole-register clobber (the load), so it introduces a
  // partial register update condition.
  def SSm : I<opc, MRMSrcMem, (outs FR32:$dst), (ins f32mem:$src),
                !strconcat(OpcodeStr, "ss\t{$src, $dst|$dst, $src}"),
                [(set FR32:$dst, (OpNode (load addr:$src)))], itins.rm>, XS,
            Requires<[UseSSE1, OptForSize]>, Sched<[itins.Sched.Folded]>;
  // Read-write intrinsic forms: the destination register doubles as the
  // first source.
  let Constraints = "$src1 = $dst" in {
    def SSr_Int : SSI<opc, MRMSrcReg, (outs VR128:$dst),
                      (ins VR128:$src1, VR128:$src2),
                      !strconcat(OpcodeStr, "ss\t{$src2, $dst|$dst, $src2}"),
                      [], itins.rr>, Sched<[itins.Sched]>;
    let mayLoad = 1, hasSideEffects = 0 in
    def SSm_Int : SSI<opc, MRMSrcMem, (outs VR128:$dst),
                      (ins VR128:$src1, ssmem:$src2),
                      !strconcat(OpcodeStr, "ss\t{$src2, $dst|$dst, $src2}"),
                      [], itins.rm>, Sched<[itins.Sched.Folded, ReadAfterLd]>;
  }
}
3150
/// sse1_fp_unop_p - SSE1 unops in packed form.
///
/// AVX forms cover 128-bit (VR128) and 256-bit (VR256) vectors and fold
/// loads via loadv4f32/loadv8f32; the legacy SSE forms fold via memopv4f32
/// (note the different load fragments for the AVX vs. SSE memory forms).
multiclass sse1_fp_unop_p<bits<8> opc, string OpcodeStr, SDNode OpNode,
                          OpndItins itins> {
let Predicates = [HasAVX] in {
  def V#NAME#PSr : PSI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                       !strconcat("v", OpcodeStr,
                                  "ps\t{$src, $dst|$dst, $src}"),
                       [(set VR128:$dst, (v4f32 (OpNode VR128:$src)))],
                       itins.rr>, VEX, Sched<[itins.Sched]>;
  def V#NAME#PSm : PSI<opc, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
                       !strconcat("v", OpcodeStr,
                                  "ps\t{$src, $dst|$dst, $src}"),
                       [(set VR128:$dst, (OpNode (loadv4f32 addr:$src)))],
                       itins.rm>, VEX, Sched<[itins.Sched.Folded]>;
  def V#NAME#PSYr : PSI<opc, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
                        !strconcat("v", OpcodeStr,
                                   "ps\t{$src, $dst|$dst, $src}"),
                        [(set VR256:$dst, (v8f32 (OpNode VR256:$src)))],
                        itins.rr>, VEX, VEX_L, Sched<[itins.Sched]>;
  def V#NAME#PSYm : PSI<opc, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src),
                        !strconcat("v", OpcodeStr,
                                   "ps\t{$src, $dst|$dst, $src}"),
                        [(set VR256:$dst, (OpNode (loadv8f32 addr:$src)))],
                        itins.rm>, VEX, VEX_L, Sched<[itins.Sched.Folded]>;
}

  def PSr : PSI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                !strconcat(OpcodeStr, "ps\t{$src, $dst|$dst, $src}"),
                [(set VR128:$dst, (v4f32 (OpNode VR128:$src)))], itins.rr>,
            Sched<[itins.Sched]>;
  def PSm : PSI<opc, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
                !strconcat(OpcodeStr, "ps\t{$src, $dst|$dst, $src}"),
                [(set VR128:$dst, (OpNode (memopv4f32 addr:$src)))], itins.rm>,
            Sched<[itins.Sched.Folded]>;
}
3186
/// sse1_fp_unop_p_int - SSE1 intrinsics unops in packed forms.
///
/// Same structure as sse1_fp_unop_p, but the patterns match the explicit
/// intrinsics (V4F32Int for 128-bit, V8F32Int for 256-bit) rather than a
/// generic SDNode.
multiclass sse1_fp_unop_p_int<bits<8> opc, string OpcodeStr,
                              Intrinsic V4F32Int, Intrinsic V8F32Int,
                              OpndItins itins> {
let Predicates = [HasAVX] in {
  def V#NAME#PSr_Int : PSI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                           !strconcat("v", OpcodeStr,
                                      "ps\t{$src, $dst|$dst, $src}"),
                           [(set VR128:$dst, (V4F32Int VR128:$src))],
                           itins.rr>, VEX, Sched<[itins.Sched]>;
  def V#NAME#PSm_Int : PSI<opc, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
                          !strconcat("v", OpcodeStr,
                          "ps\t{$src, $dst|$dst, $src}"),
                          [(set VR128:$dst, (V4F32Int (loadv4f32 addr:$src)))],
                          itins.rm>, VEX, Sched<[itins.Sched.Folded]>;
  def V#NAME#PSYr_Int : PSI<opc, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
                            !strconcat("v", OpcodeStr,
                                       "ps\t{$src, $dst|$dst, $src}"),
                            [(set VR256:$dst, (V8F32Int VR256:$src))],
                            itins.rr>, VEX, VEX_L, Sched<[itins.Sched]>;
  def V#NAME#PSYm_Int : PSI<opc, MRMSrcMem, (outs VR256:$dst),
                          (ins f256mem:$src),
                          !strconcat("v", OpcodeStr,
                                    "ps\t{$src, $dst|$dst, $src}"),
                          [(set VR256:$dst, (V8F32Int (loadv8f32 addr:$src)))],
                          itins.rm>, VEX, VEX_L, Sched<[itins.Sched.Folded]>;
}

  def PSr_Int : PSI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                    !strconcat(OpcodeStr, "ps\t{$src, $dst|$dst, $src}"),
                    [(set VR128:$dst, (V4F32Int VR128:$src))],
                    itins.rr>, Sched<[itins.Sched]>;
  def PSm_Int : PSI<opc, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
                    !strconcat(OpcodeStr, "ps\t{$src, $dst|$dst, $src}"),
                    [(set VR128:$dst, (V4F32Int (memopv4f32 addr:$src)))],
                    itins.rm>, Sched<[itins.Sched.Folded]>;
}
3224
/// sse2_fp_unop_s - SSE2 unops in scalar form.
///
/// Double-precision counterpart of sse1_fp_unop_s: AVX three-operand forms
/// with empty patterns (matched by explicit Pat<>s elsewhere), plus legacy
/// SSE2 forms with OpNode/F64Int patterns.
multiclass sse2_fp_unop_s<bits<8> opc, string OpcodeStr,
                          SDNode OpNode, Intrinsic F64Int, OpndItins itins> {
let Predicates = [HasAVX], hasSideEffects = 0 in {
  def V#NAME#SDr : SDI<opc, MRMSrcReg, (outs FR64:$dst),
                      (ins FR64:$src1, FR64:$src2),
                      !strconcat("v", OpcodeStr,
                                 "sd\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                      []>, VEX_4V, VEX_LIG, Sched<[itins.Sched]>;
  let mayLoad = 1 in {
  def V#NAME#SDm : SDI<opc, MRMSrcMem, (outs FR64:$dst),
                      (ins FR64:$src1,f64mem:$src2),
                      !strconcat("v", OpcodeStr,
                                 "sd\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                      []>, VEX_4V, VEX_LIG,
                   Sched<[itins.Sched.Folded, ReadAfterLd]>;
  def V#NAME#SDm_Int : SDI<opc, MRMSrcMem, (outs VR128:$dst),
                      (ins VR128:$src1, sdmem:$src2),
                      !strconcat("v", OpcodeStr,
                                 "sd\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                      []>, VEX_4V, VEX_LIG,
                      Sched<[itins.Sched.Folded, ReadAfterLd]>;
  }
}

  def SDr : SDI<opc, MRMSrcReg, (outs FR64:$dst), (ins FR64:$src),
                !strconcat(OpcodeStr, "sd\t{$src, $dst|$dst, $src}"),
                [(set FR64:$dst, (OpNode FR64:$src))], itins.rr>,
            Sched<[itins.Sched]>;
  // See the comments in sse1_fp_unop_s for why this is OptForSize.
  def SDm : I<opc, MRMSrcMem, (outs FR64:$dst), (ins f64mem:$src),
                !strconcat(OpcodeStr, "sd\t{$src, $dst|$dst, $src}"),
                [(set FR64:$dst, (OpNode (load addr:$src)))], itins.rm>, XD,
            Requires<[UseSSE2, OptForSize]>, Sched<[itins.Sched.Folded]>;
  def SDr_Int : SDI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                    !strconcat(OpcodeStr, "sd\t{$src, $dst|$dst, $src}"),
                    [(set VR128:$dst, (F64Int VR128:$src))], itins.rr>,
                Sched<[itins.Sched]>;
  def SDm_Int : SDI<opc, MRMSrcMem, (outs VR128:$dst), (ins sdmem:$src),
                    !strconcat(OpcodeStr, "sd\t{$src, $dst|$dst, $src}"),
                    [(set VR128:$dst, (F64Int sse_load_f64:$src))], itins.rm>,
                Sched<[itins.Sched.Folded]>;
}
3268
/// sse2_fp_unop_p - SSE2 unops in vector forms.
///
/// Double-precision counterpart of sse1_fp_unop_p: AVX 128/256-bit forms
/// fold loads via loadv2f64/loadv4f64; the legacy SSE2 memory form uses
/// memopv2f64.
multiclass sse2_fp_unop_p<bits<8> opc, string OpcodeStr,
                          SDNode OpNode, OpndItins itins> {
let Predicates = [HasAVX] in {
  def V#NAME#PDr : PDI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                       !strconcat("v", OpcodeStr,
                                  "pd\t{$src, $dst|$dst, $src}"),
                       [(set VR128:$dst, (v2f64 (OpNode VR128:$src)))],
                       itins.rr>, VEX, Sched<[itins.Sched]>;
  def V#NAME#PDm : PDI<opc, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
                       !strconcat("v", OpcodeStr,
                                  "pd\t{$src, $dst|$dst, $src}"),
                       [(set VR128:$dst, (OpNode (loadv2f64 addr:$src)))],
                       itins.rm>, VEX, Sched<[itins.Sched.Folded]>;
  def V#NAME#PDYr : PDI<opc, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
                        !strconcat("v", OpcodeStr,
                                   "pd\t{$src, $dst|$dst, $src}"),
                        [(set VR256:$dst, (v4f64 (OpNode VR256:$src)))],
                        itins.rr>, VEX, VEX_L, Sched<[itins.Sched]>;
  def V#NAME#PDYm : PDI<opc, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src),
                        !strconcat("v", OpcodeStr,
                                   "pd\t{$src, $dst|$dst, $src}"),
                        [(set VR256:$dst, (OpNode (loadv4f64 addr:$src)))],
                        itins.rm>, VEX, VEX_L, Sched<[itins.Sched.Folded]>;
}

  def PDr : PDI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
              !strconcat(OpcodeStr, "pd\t{$src, $dst|$dst, $src}"),
              [(set VR128:$dst, (v2f64 (OpNode VR128:$src)))], itins.rr>,
            Sched<[itins.Sched]>;
  def PDm : PDI<opc, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
                !strconcat(OpcodeStr, "pd\t{$src, $dst|$dst, $src}"),
                [(set VR128:$dst, (OpNode (memopv2f64 addr:$src)))], itins.rm>,
            Sched<[itins.Sched.Folded]>;
}
3304
// Square root.  One opcode (0x51) covers all four encodings; the prefix
// bytes from the SSI/SDI/PSI/PDI base classes select SS/SD/PS/PD.
defm SQRT  : sse1_fp_unop_s<0x51, "sqrt",  fsqrt, int_x86_sse_sqrt_ss,
                            SSE_SQRTSS>,
             sse1_fp_unop_p<0x51, "sqrt", fsqrt, SSE_SQRTPS>,
             sse2_fp_unop_s<0x51, "sqrt",  fsqrt, int_x86_sse2_sqrt_sd,
                            SSE_SQRTSD>,
             sse2_fp_unop_p<0x51, "sqrt", fsqrt, SSE_SQRTPD>;
3312
// Reciprocal approximations. Note that these typically require refinement
// in order to obtain suitable precision.
// NOTE(review): RSQRT reuses the SQRTSS/SQRTPS itineraries rather than
// dedicated rsqrt ones; rsqrt latency generally differs from sqrt --
// consider introducing SSE_RSQRT* itineraries.
defm RSQRT : sse1_fp_unop_rw<0x52, "rsqrt", X86frsqrt, SSE_SQRTSS>,
             sse1_fp_unop_p<0x52, "rsqrt", X86frsqrt, SSE_SQRTPS>,
             sse1_fp_unop_p_int<0x52, "rsqrt", int_x86_sse_rsqrt_ps,
                                int_x86_avx_rsqrt_ps_256, SSE_SQRTPS>;
defm RCP   : sse1_fp_unop_rw<0x53, "rcp", X86frcp, SSE_RCPS>,
             sse1_fp_unop_p<0x53, "rcp", X86frcp, SSE_RCPP>,
             sse1_fp_unop_p_int<0x53, "rcp", int_x86_sse_rcp_ps,
                                int_x86_avx_rcp_ps_256, SSE_RCPP>;
3323
// Scalar unary-op patterns selecting the AVX forms.
//
// NOTE: an outer 'let Predicates = [...]' overrides any 'Requires<...>'
// attached to the individual patterns inside it (TableGen 'let' statements
// override record-body field values).  The previous arrangement wrapped
// these patterns in 'let Predicates = [UseAVX]' while annotating them with
// 'Requires<[HasAVX]>' / 'Requires<[HasAVX, OptForSize]>', so the
// OptForSize restriction on the load-folding patterns was silently dropped.
// Folding the load must be limited to OptForSize because it introduces a
// partial register update hazard (see the comments in sse1_fp_unop_s).
// Express the predicates purely through the 'let' blocks instead.
let Predicates = [UseAVX] in {
  def : Pat<(f32 (fsqrt FR32:$src)),
            (VSQRTSSr (f32 (IMPLICIT_DEF)), FR32:$src)>;
  def : Pat<(f64 (fsqrt FR64:$src)),
            (VSQRTSDr (f64 (IMPLICIT_DEF)), FR64:$src)>;

  def : Pat<(f32 (X86frsqrt FR32:$src)),
            (VRSQRTSSr (f32 (IMPLICIT_DEF)), FR32:$src)>;

  def : Pat<(f32 (X86frcp FR32:$src)),
            (VRCPSSr (f32 (IMPLICIT_DEF)), FR32:$src)>;
}

// Load-folding variants: OptForSize only (see above).
let Predicates = [UseAVX, OptForSize] in {
  def : Pat<(f32 (fsqrt (load addr:$src))),
            (VSQRTSSm (f32 (IMPLICIT_DEF)), addr:$src)>;
  def : Pat<(f64 (fsqrt (load addr:$src))),
            (VSQRTSDm (f64 (IMPLICIT_DEF)), addr:$src)>;

  def : Pat<(f32 (X86frsqrt (load addr:$src))),
            (VRSQRTSSm (f32 (IMPLICIT_DEF)), addr:$src)>;

  def : Pat<(f32 (X86frcp (load addr:$src))),
            (VRCPSSm (f32 (IMPLICIT_DEF)), addr:$src)>;
}
// Lower the scalar sqrt intrinsics to the AVX instructions.  The register
// forms operate on FR32/FR64, so the VR128 operand is round-tripped through
// COPY_TO_REGCLASS; IMPLICIT_DEF feeds the pass-through (upper-element)
// source of the three-operand VEX encoding.
let Predicates = [UseAVX] in {
  def : Pat<(int_x86_sse_sqrt_ss VR128:$src),
            (COPY_TO_REGCLASS (VSQRTSSr (f32 (IMPLICIT_DEF)),
                                        (COPY_TO_REGCLASS VR128:$src, FR32)),
                              VR128)>;
  def : Pat<(int_x86_sse_sqrt_ss sse_load_f32:$src),
            (VSQRTSSm_Int (v4f32 (IMPLICIT_DEF)), sse_load_f32:$src)>;

  def : Pat<(int_x86_sse2_sqrt_sd VR128:$src),
            (COPY_TO_REGCLASS (VSQRTSDr (f64 (IMPLICIT_DEF)),
                                        (COPY_TO_REGCLASS VR128:$src, FR64)),
                              VR128)>;
  def : Pat<(int_x86_sse2_sqrt_sd sse_load_f64:$src),
            (VSQRTSDm_Int (v2f64 (IMPLICIT_DEF)), sse_load_f64:$src)>;
}
3363
// Lower the scalar rsqrt/rcp intrinsics to the AVX instructions.  These use
// HasAVX rather than UseAVX -- presumably because no EVEX-encoded
// replacements exist for rsqrt/rcp, unlike sqrt above; confirm before
// changing.
let Predicates = [HasAVX] in {
  def : Pat<(int_x86_sse_rsqrt_ss VR128:$src),
            (COPY_TO_REGCLASS (VRSQRTSSr (f32 (IMPLICIT_DEF)),
                                         (COPY_TO_REGCLASS VR128:$src, FR32)),
                              VR128)>;
  def : Pat<(int_x86_sse_rsqrt_ss sse_load_f32:$src),
            (VRSQRTSSm_Int (v4f32 (IMPLICIT_DEF)), sse_load_f32:$src)>;

  def : Pat<(int_x86_sse_rcp_ss VR128:$src),
            (COPY_TO_REGCLASS (VRCPSSr (f32 (IMPLICIT_DEF)),
                                       (COPY_TO_REGCLASS VR128:$src, FR32)),
                              VR128)>;
  def : Pat<(int_x86_sse_rcp_ss sse_load_f32:$src),
            (VRCPSSm_Int (v4f32 (IMPLICIT_DEF)), sse_load_f32:$src)>;
}
3379
3380// Reciprocal approximations. Note that these typically require refinement
3381// in order to obtain suitable precision.
let Predicates = [UseSSE1] in {
  // The legacy *_Int forms are read-write ("$src1 = $dst" in
  // sse1_fp_unop_rw), so the same register is passed for both the tied
  // destination and the source operand.
  def : Pat<(int_x86_sse_rsqrt_ss VR128:$src),
            (RSQRTSSr_Int VR128:$src, VR128:$src)>;
  def : Pat<(int_x86_sse_rcp_ss VR128:$src),
            (RCPSSr_Int VR128:$src, VR128:$src)>;
}
3388
3389// There is no f64 version of the reciprocal approximation instructions.
3390
3391//===----------------------------------------------------------------------===//
3392// SSE 1 & 2 - Non-temporal stores
3393//===----------------------------------------------------------------------===//
3394
// Non-temporal stores: AVX 128/256-bit and legacy SSE forms, plus the
// GPR movnti forms.  AddedComplexity = 400 makes ISel prefer these patterns
// over the ordinary (temporal) store patterns when the DAG node is an
// (aligned)nontemporalstore.
let AddedComplexity = 400 in { // Prefer non-temporal versions
let SchedRW = [WriteStore] in {
def VMOVNTPSmr : VPSI<0x2B, MRMDestMem, (outs),
                     (ins f128mem:$dst, VR128:$src),
                     "movntps\t{$src, $dst|$dst, $src}",
                     [(alignednontemporalstore (v4f32 VR128:$src),
                                               addr:$dst)],
                                               IIC_SSE_MOVNT>, VEX;
def VMOVNTPDmr : VPDI<0x2B, MRMDestMem, (outs),
                     (ins f128mem:$dst, VR128:$src),
                     "movntpd\t{$src, $dst|$dst, $src}",
                     [(alignednontemporalstore (v2f64 VR128:$src),
                                               addr:$dst)],
                                               IIC_SSE_MOVNT>, VEX;

let ExeDomain = SSEPackedInt in
def VMOVNTDQmr    : VPDI<0xE7, MRMDestMem, (outs),
                         (ins f128mem:$dst, VR128:$src),
                         "movntdq\t{$src, $dst|$dst, $src}",
                         [(alignednontemporalstore (v2i64 VR128:$src),
                                                   addr:$dst)],
                                                   IIC_SSE_MOVNT>, VEX;

def VMOVNTPSYmr : VPSI<0x2B, MRMDestMem, (outs),
                     (ins f256mem:$dst, VR256:$src),
                     "movntps\t{$src, $dst|$dst, $src}",
                     [(alignednontemporalstore (v8f32 VR256:$src),
                                               addr:$dst)],
                                               IIC_SSE_MOVNT>, VEX, VEX_L;
def VMOVNTPDYmr : VPDI<0x2B, MRMDestMem, (outs),
                     (ins f256mem:$dst, VR256:$src),
                     "movntpd\t{$src, $dst|$dst, $src}",
                     [(alignednontemporalstore (v4f64 VR256:$src),
                                               addr:$dst)],
                                               IIC_SSE_MOVNT>, VEX, VEX_L;
let ExeDomain = SSEPackedInt in
def VMOVNTDQYmr : VPDI<0xE7, MRMDestMem, (outs),
                    (ins f256mem:$dst, VR256:$src),
                    "movntdq\t{$src, $dst|$dst, $src}",
                    [(alignednontemporalstore (v4i64 VR256:$src),
                                              addr:$dst)],
                                              IIC_SSE_MOVNT>, VEX, VEX_L;

def MOVNTPSmr : PSI<0x2B, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
                    "movntps\t{$src, $dst|$dst, $src}",
                    [(alignednontemporalstore (v4f32 VR128:$src), addr:$dst)],
                    IIC_SSE_MOVNT>;
def MOVNTPDmr : PDI<0x2B, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
                    "movntpd\t{$src, $dst|$dst, $src}",
                    [(alignednontemporalstore(v2f64 VR128:$src), addr:$dst)],
                    IIC_SSE_MOVNT>;

let ExeDomain = SSEPackedInt in
def MOVNTDQmr : PDI<0xE7, MRMDestMem, (outs), (ins i128mem:$dst, VR128:$src),
                    "movntdq\t{$src, $dst|$dst, $src}",
                    [(alignednontemporalstore (v2i64 VR128:$src), addr:$dst)],
                    IIC_SSE_MOVNT>;

// There is no AVX form for instructions below this point
def MOVNTImr : I<0xC3, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src),
                 "movnti{l}\t{$src, $dst|$dst, $src}",
                 [(nontemporalstore (i32 GR32:$src), addr:$dst)],
                 IIC_SSE_MOVNT>,
               TB, Requires<[HasSSE2]>;
def MOVNTI_64mr : RI<0xC3, MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src),
                     "movnti{q}\t{$src, $dst|$dst, $src}",
                     [(nontemporalstore (i64 GR64:$src), addr:$dst)],
                     IIC_SSE_MOVNT>,
                  TB, Requires<[HasSSE2]>;
} // SchedRW = [WriteStore]

// Extra v2i64 patterns: MOVNTDQ also handles the generic 128-bit integer
// non-temporal store (its def above already matches v2i64; these keep the
// AVX/SSE split explicit via Requires).
def : Pat<(alignednontemporalstore (v2i64 VR128:$src), addr:$dst),
          (VMOVNTDQmr addr:$dst, VR128:$src)>, Requires<[HasAVX]>;

def : Pat<(alignednontemporalstore (v2i64 VR128:$src), addr:$dst),
          (MOVNTDQmr addr:$dst, VR128:$src)>, Requires<[UseSSE2]>;
} // AddedComplexity
3472
3473//===----------------------------------------------------------------------===//
3474// SSE 1 & 2 - Prefetch and memory fence
3475//===----------------------------------------------------------------------===//
3476
// Prefetch intrinsic.  The third pattern operand is the locality hint
// (3 = T0 ... 0 = NTA), matching the ISD::PREFETCH node; the leading bare
// 'imm' matches the read/write operand regardless of value, since x86 has
// no separate write-prefetch encodings here.
let Predicates = [HasSSE1], SchedRW = [WriteLoad] in {
def PREFETCHT0   : I<0x18, MRM1m, (outs), (ins i8mem:$src),
    "prefetcht0\t$src", [(prefetch addr:$src, imm, (i32 3), (i32 1))],
    IIC_SSE_PREFETCH>, TB;
def PREFETCHT1   : I<0x18, MRM2m, (outs), (ins i8mem:$src),
    "prefetcht1\t$src", [(prefetch addr:$src, imm, (i32 2), (i32 1))],
    IIC_SSE_PREFETCH>, TB;
def PREFETCHT2   : I<0x18, MRM3m, (outs), (ins i8mem:$src),
    "prefetcht2\t$src", [(prefetch addr:$src, imm, (i32 1), (i32 1))],
    IIC_SSE_PREFETCH>, TB;
def PREFETCHNTA  : I<0x18, MRM0m, (outs), (ins i8mem:$src),
    "prefetchnta\t$src", [(prefetch addr:$src, imm, (i32 0), (i32 1))],
    IIC_SSE_PREFETCH>, TB;
}
3492
// FIXME: How should these memory instructions be modeled?
let SchedRW = [WriteLoad] in {
// Flush cache
def CLFLUSH : I<0xAE, MRM7m, (outs), (ins i8mem:$src),
               "clflush\t$src", [(int_x86_sse2_clflush addr:$src)],
               IIC_SSE_PREFETCH>, TB, Requires<[HasSSE2]>;

// Pause. This "instruction" is encoded as "rep; nop", so even though it
// was introduced with SSE2, it's backward compatible.
def PAUSE : I<0x90, RawFrm, (outs), (ins), "pause", [], IIC_SSE_PAUSE>, REP;

// Load, store, and memory fence.  SFENCE only needs SSE1; LFENCE/MFENCE
// require SSE2.  MRM_F8/E8/F0 encode the fixed ModRM bytes.
def SFENCE : I<0xAE, MRM_F8, (outs), (ins),
               "sfence", [(int_x86_sse_sfence)], IIC_SSE_SFENCE>,
               TB, Requires<[HasSSE1]>;
def LFENCE : I<0xAE, MRM_E8, (outs), (ins),
               "lfence", [(int_x86_sse2_lfence)], IIC_SSE_LFENCE>,
               TB, Requires<[HasSSE2]>;
def MFENCE : I<0xAE, MRM_F0, (outs), (ins),
               "mfence", [(int_x86_sse2_mfence)], IIC_SSE_MFENCE>,
               TB, Requires<[HasSSE2]>;
} // SchedRW
3515
// Map the target-specific fence DAG nodes onto the fence instructions.
def : Pat<(X86SFence), (SFENCE)>;
def : Pat<(X86LFence), (LFENCE)>;
def : Pat<(X86MFence), (MFENCE)>;
3519
3520//===----------------------------------------------------------------------===//
3521// SSE 1 & 2 - Load/Store XCSR register
3522//===----------------------------------------------------------------------===//
3523
// Load/store the MXCSR control/status register, in VEX and legacy forms.
def VLDMXCSR : VPSI<0xAE, MRM2m, (outs), (ins i32mem:$src),
                  "ldmxcsr\t$src", [(int_x86_sse_ldmxcsr addr:$src)],
                  IIC_SSE_LDMXCSR>, VEX, Sched<[WriteLoad]>;
def VSTMXCSR : VPSI<0xAE, MRM3m, (outs), (ins i32mem:$dst),
                  "stmxcsr\t$dst", [(int_x86_sse_stmxcsr addr:$dst)],
                  IIC_SSE_STMXCSR>, VEX, Sched<[WriteStore]>;

def LDMXCSR : PSI<0xAE, MRM2m, (outs), (ins i32mem:$src),
                  "ldmxcsr\t$src", [(int_x86_sse_ldmxcsr addr:$src)],
                  IIC_SSE_LDMXCSR>, Sched<[WriteLoad]>;
def STMXCSR : PSI<0xAE, MRM3m, (outs), (ins i32mem:$dst),
                  "stmxcsr\t$dst", [(int_x86_sse_stmxcsr addr:$dst)],
                  IIC_SSE_STMXCSR>, Sched<[WriteStore]>;
3537
3538//===---------------------------------------------------------------------===//
3539// SSE2 - Move Aligned/Unaligned Packed Integer Instructions
3540//===---------------------------------------------------------------------===//
3541
3542let ExeDomain = SSEPackedInt in { // SSE integer instructions
3543
// AVX register-to-register movdqa/movdqu.  Pattern-less; selected by
// explicit patterns and by the register allocator for copies.
// NOTE(review): this block uses the legacy 'neverHasSideEffects' flag while
// the disassembler block below uses 'hasSideEffects = 0' -- confirm which
// form this LLVM revision canonically expects before unifying.
let neverHasSideEffects = 1, SchedRW = [WriteMove] in {
def VMOVDQArr  : VPDI<0x6F, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                    "movdqa\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVA_P_RR>,
                    VEX;
def VMOVDQAYrr : VPDI<0x6F, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
                    "movdqa\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVA_P_RR>,
                    VEX, VEX_L;
def VMOVDQUrr  : VSSI<0x6F, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                    "movdqu\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVU_P_RR>,
                    VEX;
def VMOVDQUYrr : VSSI<0x6F, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
                    "movdqu\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVU_P_RR>,
                    VEX, VEX_L;
}

// For Disassembler: the 0x7F store-direction opcode with a register
// destination.  isCodeGenOnly keeps these out of instruction selection.
let isCodeGenOnly = 1, hasSideEffects = 0, SchedRW = [WriteMove] in {
def VMOVDQArr_REV  : VPDI<0x7F, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
                        "movdqa\t{$src, $dst|$dst, $src}", [],
                        IIC_SSE_MOVA_P_RR>,
                        VEX;
def VMOVDQAYrr_REV : VPDI<0x7F, MRMDestReg, (outs VR256:$dst), (ins VR256:$src),
                        "movdqa\t{$src, $dst|$dst, $src}", [],
                        IIC_SSE_MOVA_P_RR>, VEX, VEX_L;
def VMOVDQUrr_REV  : VSSI<0x7F, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
                        "movdqu\t{$src, $dst|$dst, $src}", [],
                        IIC_SSE_MOVU_P_RR>,
                        VEX;
def VMOVDQUYrr_REV : VSSI<0x7F, MRMDestReg, (outs VR256:$dst), (ins VR256:$src),
                        "movdqu\t{$src, $dst|$dst, $src}", [],
                        IIC_SSE_MOVU_P_RR>, VEX, VEX_L;
}
3576
// AVX movdqa/movdqu loads.  Pattern-less (patterns live elsewhere);
// canFoldAsLoad/isReMaterializable let the two-address and spill code
// re-issue the load instead of copying.
let canFoldAsLoad = 1, mayLoad = 1, isReMaterializable = 1,
    neverHasSideEffects = 1, SchedRW = [WriteLoad] in {
def VMOVDQArm  : VPDI<0x6F, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
                   "movdqa\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVA_P_RM>,
                   VEX;
def VMOVDQAYrm : VPDI<0x6F, MRMSrcMem, (outs VR256:$dst), (ins i256mem:$src),
                   "movdqa\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVA_P_RM>,
                   VEX, VEX_L;
// movdqu uses the raw I class (not VSSI) so the "v" mnemonic is spelled out.
let Predicates = [HasAVX] in {
  def VMOVDQUrm  : I<0x6F, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
                    "vmovdqu\t{$src, $dst|$dst, $src}",[], IIC_SSE_MOVU_P_RM>,
                    XS, VEX;
  def VMOVDQUYrm : I<0x6F, MRMSrcMem, (outs VR256:$dst), (ins i256mem:$src),
                    "vmovdqu\t{$src, $dst|$dst, $src}",[], IIC_SSE_MOVU_P_RM>,
                    XS, VEX, VEX_L;
}
}

// AVX movdqa/movdqu stores (opcode 0x7F, store direction).
let mayStore = 1, neverHasSideEffects = 1, SchedRW = [WriteStore] in {
def VMOVDQAmr  : VPDI<0x7F, MRMDestMem, (outs),
                     (ins i128mem:$dst, VR128:$src),
                     "movdqa\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVA_P_MR>,
                     VEX;
def VMOVDQAYmr : VPDI<0x7F, MRMDestMem, (outs),
                     (ins i256mem:$dst, VR256:$src),
                     "movdqa\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVA_P_MR>,
                     VEX, VEX_L;
let Predicates = [HasAVX] in {
def VMOVDQUmr  : I<0x7F, MRMDestMem, (outs), (ins i128mem:$dst, VR128:$src),
                  "vmovdqu\t{$src, $dst|$dst, $src}",[], IIC_SSE_MOVU_P_MR>,
                  XS, VEX;
def VMOVDQUYmr : I<0x7F, MRMDestMem, (outs), (ins i256mem:$dst, VR256:$src),
                  "vmovdqu\t{$src, $dst|$dst, $src}",[], IIC_SSE_MOVU_P_MR>,
                  XS, VEX, VEX_L;
}
}
3613
3614let SchedRW = [WriteMove] in {
3615let neverHasSideEffects = 1 in
3616def MOVDQArr : PDI<0x6F, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
3617                   "movdqa\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVA_P_RR>;
3618
3619def MOVDQUrr :   I<0x6F, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
3620                   "movdqu\t{$src, $dst|$dst, $src}",
3621                   [], IIC_SSE_MOVU_P_RR>, XS, Requires<[UseSSE2]>;
3622
3623// For Disassembler
3624let isCodeGenOnly = 1, hasSideEffects = 0 in {
3625def MOVDQArr_REV : PDI<0x7F, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
3626                       "movdqa\t{$src, $dst|$dst, $src}", [],
3627                       IIC_SSE_MOVA_P_RR>;
3628
3629def MOVDQUrr_REV :   I<0x7F, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
3630                       "movdqu\t{$src, $dst|$dst, $src}",
3631                       [], IIC_SSE_MOVU_P_RR>, XS, Requires<[UseSSE2]>;
3632}
3633} // SchedRW
3634
// SSE2 aligned/unaligned 128-bit integer loads.  The load patterns are
// kept disabled (commented out) here; selection happens elsewhere.
// Consistency fix: deprecated `neverHasSideEffects = 1` replaced by the
// equivalent modern `hasSideEffects = 0`, matching usage in this file.
let canFoldAsLoad = 1, mayLoad = 1, isReMaterializable = 1,
    hasSideEffects = 0, SchedRW = [WriteLoad] in {
def MOVDQArm : PDI<0x6F, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
                   "movdqa\t{$src, $dst|$dst, $src}",
                   [/*(set VR128:$dst, (alignedloadv2i64 addr:$src))*/],
                   IIC_SSE_MOVA_P_RM>;
def MOVDQUrm :   I<0x6F, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
                   "movdqu\t{$src, $dst|$dst, $src}",
                   [/*(set VR128:$dst, (loadv2i64 addr:$src))*/],
                   IIC_SSE_MOVU_P_RM>,
                 XS, Requires<[UseSSE2]>;
}
3647
// SSE2 aligned/unaligned 128-bit integer stores.  The store patterns are
// kept disabled (commented out) here; selection happens via the Pat<>s
// below.
// Consistency fix: deprecated `neverHasSideEffects = 1` replaced by the
// equivalent modern `hasSideEffects = 0`, matching usage in this file.
let mayStore = 1, hasSideEffects = 0, SchedRW = [WriteStore] in {
def MOVDQAmr : PDI<0x7F, MRMDestMem, (outs), (ins i128mem:$dst, VR128:$src),
                   "movdqa\t{$src, $dst|$dst, $src}",
                   [/*(alignedstore (v2i64 VR128:$src), addr:$dst)*/],
                   IIC_SSE_MOVA_P_MR>;
def MOVDQUmr :   I<0x7F, MRMDestMem, (outs), (ins i128mem:$dst, VR128:$src),
                   "movdqu\t{$src, $dst|$dst, $src}",
                   [/*(store (v2i64 VR128:$src), addr:$dst)*/],
                   IIC_SSE_MOVU_P_MR>,
                 XS, Requires<[UseSSE2]>;
}
3659
3660} // ExeDomain = SSEPackedInt
3661
// Lower the unaligned-store intrinsics onto the movdqu store instructions
// defined above; AVX gets the VEX-encoded forms, plain SSE2 the legacy one.
let Predicates = [HasAVX] in {
  def : Pat<(int_x86_sse2_storeu_dq addr:$dst, VR128:$src),
            (VMOVDQUmr addr:$dst, VR128:$src)>;
  def : Pat<(int_x86_avx_storeu_dq_256 addr:$dst, VR256:$src),
            (VMOVDQUYmr addr:$dst, VR256:$src)>;
}
let Predicates = [UseSSE2] in
def : Pat<(int_x86_sse2_storeu_dq addr:$dst, VR128:$src),
          (MOVDQUmr addr:$dst, VR128:$src)>;
3671
3672//===---------------------------------------------------------------------===//
3673// SSE2 - Packed Integer Arithmetic Instructions
3674//===---------------------------------------------------------------------===//
3675
// Itinerary bundle for pmaddwd-style ops: reg-reg and reg-mem forms share
// the IIC_SSE_PMADD class and schedule as a vector integer multiply.
let Sched = WriteVecIMul in
def SSE_PMADD : OpndItins<
  IIC_SSE_PMADD, IIC_SSE_PMADD
>;
3680
3681let ExeDomain = SSEPackedInt in { // SSE integer instructions
3682
// PDI_binop_rm_int - packed-integer binary op expressed through an
// intrinsic.  Emits a reg-reg (rr) and a reg-mem (rm) form; Is2Addr picks
// the two-operand SSE asm string vs. the three-operand AVX one.  The rm
// form folds the memory operand through bitconvert of memop_frag.
multiclass PDI_binop_rm_int<bits<8> opc, string OpcodeStr, Intrinsic IntId,
                            RegisterClass RC, PatFrag memop_frag,
                            X86MemOperand x86memop,
                            OpndItins itins,
                            bit IsCommutable = 0,
                            bit Is2Addr = 1> {
  let isCommutable = IsCommutable in
  def rr : PDI<opc, MRMSrcReg, (outs RC:$dst),
       (ins RC:$src1, RC:$src2),
       !if(Is2Addr,
           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
       [(set RC:$dst, (IntId RC:$src1, RC:$src2))], itins.rr>,
      Sched<[itins.Sched]>;
  def rm : PDI<opc, MRMSrcMem, (outs RC:$dst),
       (ins RC:$src1, x86memop:$src2),
       !if(Is2Addr,
           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
       [(set RC:$dst, (IntId RC:$src1, (bitconvert (memop_frag addr:$src2))))],
       itins.rm>, Sched<[itins.Sched.Folded, ReadAfterLd]>;
}
3705
// PDI_binop_all_int - instantiate an intrinsic binop in three flavors:
// VEX-encoded 128-bit (HasAVX), legacy SSE 128-bit (tied src1/dst), and
// 256-bit (HasAVX2).  The VEX forms use loadv2i64/loadv4i64 while the
// legacy form uses memopv2i64 — the AVX forms may fold unaligned loads,
// the SSE form only aligned ones.
multiclass PDI_binop_all_int<bits<8> opc, string OpcodeStr, Intrinsic IntId128,
                             Intrinsic IntId256, OpndItins itins,
                             bit IsCommutable = 0> {
let Predicates = [HasAVX] in
  defm V#NAME : PDI_binop_rm_int<opc, !strconcat("v", OpcodeStr), IntId128,
                                 VR128, loadv2i64, i128mem, itins,
                                 IsCommutable, 0>, VEX_4V;

let Constraints = "$src1 = $dst" in
  defm NAME : PDI_binop_rm_int<opc, OpcodeStr, IntId128, VR128, memopv2i64,
                               i128mem, itins, IsCommutable, 1>;

let Predicates = [HasAVX2] in
  defm V#NAME#Y : PDI_binop_rm_int<opc, !strconcat("v", OpcodeStr), IntId256,
                                   VR256, loadv4i64, i256mem, itins,
                                   IsCommutable, 0>, VEX_4V, VEX_L;
}
3723
// PDI_binop_rmi - packed shift with three encodings: shift count in the
// low bits of an XMM register (rr), count loaded from memory (rm), and
// immediate count (ri).  OpNode handles the vector-count forms, OpNode2
// the immediate form.
// NOTE(review): the rm form folds through memopv2i64 even when this
// multiclass is instantiated for VEX variants; confirm whether unaligned
// folding (loadv2i64) was intended for the AVX instantiations.
multiclass PDI_binop_rmi<bits<8> opc, bits<8> opc2, Format ImmForm,
                         string OpcodeStr, SDNode OpNode,
                         SDNode OpNode2, RegisterClass RC,
                         ValueType DstVT, ValueType SrcVT, PatFrag bc_frag,
                         ShiftOpndItins itins,
                         bit Is2Addr = 1> {
  // src2 is always 128-bit
  def rr : PDI<opc, MRMSrcReg, (outs RC:$dst),
       (ins RC:$src1, VR128:$src2),
       !if(Is2Addr,
           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
       [(set RC:$dst, (DstVT (OpNode RC:$src1, (SrcVT VR128:$src2))))],
        itins.rr>, Sched<[WriteVecShift]>;
  def rm : PDI<opc, MRMSrcMem, (outs RC:$dst),
       (ins RC:$src1, i128mem:$src2),
       !if(Is2Addr,
           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
       [(set RC:$dst, (DstVT (OpNode RC:$src1,
                       (bc_frag (memopv2i64 addr:$src2)))))], itins.rm>,
      Sched<[WriteVecShiftLd, ReadAfterLd]>;
  def ri : PDIi8<opc2, ImmForm, (outs RC:$dst),
       (ins RC:$src1, i8imm:$src2),
       !if(Is2Addr,
           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
       [(set RC:$dst, (DstVT (OpNode2 RC:$src1, (i8 imm:$src2))))], itins.ri>,
       Sched<[WriteVecShift]>;
}
3754
/// PDI_binop_rm2 - Simple SSE2 binary operator with different src and dst types
///
/// rr and rm forms; Is2Addr selects the two-operand SSE asm string vs. the
/// three-operand AVX one.  Unlike PDI_binop_rm_int, source and destination
/// value types may differ (e.g. pmuludq: v4i32 sources, v2i64 result).
/// Consistency fix: thread itins.rr/itins.rm through to the PDI defs (PDI's
/// itinerary parameter has a default), matching the sibling multiclasses
/// PDI_binop_rm_int and PDI_binop_rmi, which previously left these
/// instructions on the default itinerary.
multiclass PDI_binop_rm2<bits<8> opc, string OpcodeStr, SDNode OpNode,
                         ValueType DstVT, ValueType SrcVT, RegisterClass RC,
                         PatFrag memop_frag, X86MemOperand x86memop,
                         OpndItins itins,
                         bit IsCommutable = 0, bit Is2Addr = 1> {
  let isCommutable = IsCommutable in
  def rr : PDI<opc, MRMSrcReg, (outs RC:$dst),
       (ins RC:$src1, RC:$src2),
       !if(Is2Addr,
           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
       [(set RC:$dst, (DstVT (OpNode (SrcVT RC:$src1), RC:$src2)))],
       itins.rr>,
       Sched<[itins.Sched]>;
  def rm : PDI<opc, MRMSrcMem, (outs RC:$dst),
       (ins RC:$src1, x86memop:$src2),
       !if(Is2Addr,
           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
       [(set RC:$dst, (DstVT (OpNode (SrcVT RC:$src1),
                                     (bitconvert (memop_frag addr:$src2)))))],
       itins.rm>,
       Sched<[itins.Sched.Folded, ReadAfterLd]>;
}
3778} // ExeDomain = SSEPackedInt
3779
// SSE2/AVX/AVX2 packed integer add/sub/mul, saturating unsigned sub, and
// byte/word min/max.  The trailing bit marks the op commutable (1) or not
// (0).  paddq/psubq use the 64-bit-element itinerary bundle.
defm PADDB   : PDI_binop_all<0xFC, "paddb", add, v16i8, v32i8,
                             SSE_INTALU_ITINS_P, 1>;
defm PADDW   : PDI_binop_all<0xFD, "paddw", add, v8i16, v16i16,
                             SSE_INTALU_ITINS_P, 1>;
defm PADDD   : PDI_binop_all<0xFE, "paddd", add, v4i32, v8i32,
                             SSE_INTALU_ITINS_P, 1>;
defm PADDQ   : PDI_binop_all<0xD4, "paddq", add, v2i64, v4i64,
                             SSE_INTALUQ_ITINS_P, 1>;
defm PMULLW  : PDI_binop_all<0xD5, "pmullw", mul, v8i16, v16i16,
                             SSE_INTMUL_ITINS_P, 1>;
defm PSUBB   : PDI_binop_all<0xF8, "psubb", sub, v16i8, v32i8,
                             SSE_INTALU_ITINS_P, 0>;
defm PSUBW   : PDI_binop_all<0xF9, "psubw", sub, v8i16, v16i16,
                             SSE_INTALU_ITINS_P, 0>;
defm PSUBD   : PDI_binop_all<0xFA, "psubd", sub, v4i32, v8i32,
                             SSE_INTALU_ITINS_P, 0>;
defm PSUBQ   : PDI_binop_all<0xFB, "psubq", sub, v2i64, v4i64,
                             SSE_INTALUQ_ITINS_P, 0>;
defm PSUBUSB : PDI_binop_all<0xD8, "psubusb", X86subus, v16i8, v32i8,
                             SSE_INTALU_ITINS_P, 0>;
defm PSUBUSW : PDI_binop_all<0xD9, "psubusw", X86subus, v8i16, v16i16,
                             SSE_INTALU_ITINS_P, 0>;
defm PMINUB  : PDI_binop_all<0xDA, "pminub", X86umin, v16i8, v32i8,
                             SSE_INTALU_ITINS_P, 1>;
defm PMINSW  : PDI_binop_all<0xEA, "pminsw", X86smin, v8i16, v16i16,
                             SSE_INTALU_ITINS_P, 1>;
defm PMAXUB  : PDI_binop_all<0xDE, "pmaxub", X86umax, v16i8, v32i8,
                             SSE_INTALU_ITINS_P, 1>;
defm PMAXSW  : PDI_binop_all<0xEE, "pmaxsw", X86smax, v8i16, v16i16,
                             SSE_INTALU_ITINS_P, 1>;
3810
// Intrinsic forms: ops with no plain SDNode equivalent (signed/unsigned
// saturating add/sub, high multiplies, multiply-add, average, sum of
// absolute differences).  pmaddwd and psadbw schedule on the multiply-add
// itinerary bundle (SSE_PMADD); the rest on ALU/multiply bundles.
defm PSUBSB  : PDI_binop_all_int<0xE8, "psubsb", int_x86_sse2_psubs_b,
                                 int_x86_avx2_psubs_b, SSE_INTALU_ITINS_P, 0>;
defm PSUBSW  : PDI_binop_all_int<0xE9, "psubsw" , int_x86_sse2_psubs_w,
                                 int_x86_avx2_psubs_w, SSE_INTALU_ITINS_P, 0>;
defm PADDSB  : PDI_binop_all_int<0xEC, "paddsb" , int_x86_sse2_padds_b,
                                 int_x86_avx2_padds_b, SSE_INTALU_ITINS_P, 1>;
defm PADDSW  : PDI_binop_all_int<0xED, "paddsw" , int_x86_sse2_padds_w,
                                 int_x86_avx2_padds_w, SSE_INTALU_ITINS_P, 1>;
defm PADDUSB : PDI_binop_all_int<0xDC, "paddusb", int_x86_sse2_paddus_b,
                                 int_x86_avx2_paddus_b, SSE_INTALU_ITINS_P, 1>;
defm PADDUSW : PDI_binop_all_int<0xDD, "paddusw", int_x86_sse2_paddus_w,
                                 int_x86_avx2_paddus_w, SSE_INTALU_ITINS_P, 1>;
defm PMULHUW : PDI_binop_all_int<0xE4, "pmulhuw", int_x86_sse2_pmulhu_w,
                                 int_x86_avx2_pmulhu_w, SSE_INTMUL_ITINS_P, 1>;
defm PMULHW  : PDI_binop_all_int<0xE5, "pmulhw" , int_x86_sse2_pmulh_w,
                                 int_x86_avx2_pmulh_w, SSE_INTMUL_ITINS_P, 1>;
defm PMADDWD : PDI_binop_all_int<0xF5, "pmaddwd", int_x86_sse2_pmadd_wd,
                                 int_x86_avx2_pmadd_wd, SSE_PMADD, 1>;
defm PAVGB   : PDI_binop_all_int<0xE0, "pavgb", int_x86_sse2_pavg_b,
                                 int_x86_avx2_pavg_b, SSE_INTALU_ITINS_P, 1>;
defm PAVGW   : PDI_binop_all_int<0xE3, "pavgw", int_x86_sse2_pavg_w,
                                 int_x86_avx2_pavg_w, SSE_INTALU_ITINS_P, 1>;
defm PSADBW  : PDI_binop_all_int<0xF6, "psadbw", int_x86_sse2_psad_bw,
                                 int_x86_avx2_psad_bw, SSE_PMADD, 1>;
3836
// pmuludq: unsigned 32x32 -> 64-bit multiply, so the source and result
// vector types differ (handled by PDI_binop_rm2).  VEX 128-bit, AVX2
// 256-bit, and legacy SSE (tied operands) instantiations.
let Predicates = [HasAVX] in
defm VPMULUDQ : PDI_binop_rm2<0xF4, "vpmuludq", X86pmuludq, v2i64, v4i32, VR128,
                              loadv2i64, i128mem, SSE_INTMUL_ITINS_P, 1, 0>,
                              VEX_4V;
let Predicates = [HasAVX2] in
defm VPMULUDQY : PDI_binop_rm2<0xF4, "vpmuludq", X86pmuludq, v4i64, v8i32,
                               VR256, loadv4i64, i256mem,
                               SSE_INTMUL_ITINS_P, 1, 0>, VEX_4V, VEX_L;
let Constraints = "$src1 = $dst" in
defm PMULUDQ : PDI_binop_rm2<0xF4, "pmuludq", X86pmuludq, v2i64, v4i32, VR128,
                             memopv2i64, i128mem, SSE_INTMUL_ITINS_P, 1>;
3848
3849//===---------------------------------------------------------------------===//
3850// SSE2 - Packed Integer Logical Instructions
3851//===---------------------------------------------------------------------===//
3852
// VEX-encoded 128-bit packed shifts (word/dword/qword left, logical right,
// arithmetic right).  The byte-granularity whole-register shifts
// (vpslldq/vpsrldq) take only an immediate, so they are defined directly
// rather than via PDI_binop_rmi.
let Predicates = [HasAVX] in {
defm VPSLLW : PDI_binop_rmi<0xF1, 0x71, MRM6r, "vpsllw", X86vshl, X86vshli,
                            VR128, v8i16, v8i16, bc_v8i16,
                            SSE_INTSHIFT_ITINS_P, 0>, VEX_4V;
defm VPSLLD : PDI_binop_rmi<0xF2, 0x72, MRM6r, "vpslld", X86vshl, X86vshli,
                            VR128, v4i32, v4i32, bc_v4i32,
                            SSE_INTSHIFT_ITINS_P, 0>, VEX_4V;
defm VPSLLQ : PDI_binop_rmi<0xF3, 0x73, MRM6r, "vpsllq", X86vshl, X86vshli,
                            VR128, v2i64, v2i64, bc_v2i64,
                            SSE_INTSHIFT_ITINS_P, 0>, VEX_4V;

defm VPSRLW : PDI_binop_rmi<0xD1, 0x71, MRM2r, "vpsrlw", X86vsrl, X86vsrli,
                            VR128, v8i16, v8i16, bc_v8i16,
                            SSE_INTSHIFT_ITINS_P, 0>, VEX_4V;
defm VPSRLD : PDI_binop_rmi<0xD2, 0x72, MRM2r, "vpsrld", X86vsrl, X86vsrli,
                            VR128, v4i32, v4i32, bc_v4i32,
                            SSE_INTSHIFT_ITINS_P, 0>, VEX_4V;
defm VPSRLQ : PDI_binop_rmi<0xD3, 0x73, MRM2r, "vpsrlq", X86vsrl, X86vsrli,
                            VR128, v2i64, v2i64, bc_v2i64,
                            SSE_INTSHIFT_ITINS_P, 0>, VEX_4V;

// No vpsraq: arithmetic right shift of qwords doesn't exist pre-AVX-512.
defm VPSRAW : PDI_binop_rmi<0xE1, 0x71, MRM4r, "vpsraw", X86vsra, X86vsrai,
                            VR128, v8i16, v8i16, bc_v8i16,
                            SSE_INTSHIFT_ITINS_P, 0>, VEX_4V;
defm VPSRAD : PDI_binop_rmi<0xE2, 0x72, MRM4r, "vpsrad", X86vsra, X86vsrai,
                            VR128, v4i32, v4i32, bc_v4i32,
                            SSE_INTSHIFT_ITINS_P, 0>, VEX_4V;

let ExeDomain = SSEPackedInt, SchedRW = [WriteVecShift] in {
  // 128-bit logical shifts.
  def VPSLLDQri : PDIi8<0x73, MRM7r,
                    (outs VR128:$dst), (ins VR128:$src1, i32i8imm:$src2),
                    "vpslldq\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                    [(set VR128:$dst,
                      (int_x86_sse2_psll_dq_bs VR128:$src1, imm:$src2))]>,
                    VEX_4V;
  def VPSRLDQri : PDIi8<0x73, MRM3r,
                    (outs VR128:$dst), (ins VR128:$src1, i32i8imm:$src2),
                    "vpsrldq\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                    [(set VR128:$dst,
                      (int_x86_sse2_psrl_dq_bs VR128:$src1, imm:$src2))]>,
                    VEX_4V;
  // PSRADQri doesn't exist in SSE[1-3].
}
} // Predicates = [HasAVX]
3898
// AVX2 256-bit packed shifts.  Note the shift count operand is still a
// 128-bit register (SrcVT stays v8i16/v4i32/v2i64) even though the data
// being shifted is 256-bit.
let Predicates = [HasAVX2] in {
defm VPSLLWY : PDI_binop_rmi<0xF1, 0x71, MRM6r, "vpsllw", X86vshl, X86vshli,
                             VR256, v16i16, v8i16, bc_v8i16,
                             SSE_INTSHIFT_ITINS_P, 0>, VEX_4V, VEX_L;
defm VPSLLDY : PDI_binop_rmi<0xF2, 0x72, MRM6r, "vpslld", X86vshl, X86vshli,
                             VR256, v8i32, v4i32, bc_v4i32,
                             SSE_INTSHIFT_ITINS_P, 0>, VEX_4V, VEX_L;
defm VPSLLQY : PDI_binop_rmi<0xF3, 0x73, MRM6r, "vpsllq", X86vshl, X86vshli,
                             VR256, v4i64, v2i64, bc_v2i64,
                             SSE_INTSHIFT_ITINS_P, 0>, VEX_4V, VEX_L;

defm VPSRLWY : PDI_binop_rmi<0xD1, 0x71, MRM2r, "vpsrlw", X86vsrl, X86vsrli,
                             VR256, v16i16, v8i16, bc_v8i16,
                             SSE_INTSHIFT_ITINS_P, 0>, VEX_4V, VEX_L;
defm VPSRLDY : PDI_binop_rmi<0xD2, 0x72, MRM2r, "vpsrld", X86vsrl, X86vsrli,
                             VR256, v8i32, v4i32, bc_v4i32,
                             SSE_INTSHIFT_ITINS_P, 0>, VEX_4V, VEX_L;
defm VPSRLQY : PDI_binop_rmi<0xD3, 0x73, MRM2r, "vpsrlq", X86vsrl, X86vsrli,
                             VR256, v4i64, v2i64, bc_v2i64,
                             SSE_INTSHIFT_ITINS_P, 0>, VEX_4V, VEX_L;

defm VPSRAWY : PDI_binop_rmi<0xE1, 0x71, MRM4r, "vpsraw", X86vsra, X86vsrai,
                             VR256, v16i16, v8i16, bc_v8i16,
                             SSE_INTSHIFT_ITINS_P, 0>, VEX_4V, VEX_L;
defm VPSRADY : PDI_binop_rmi<0xE2, 0x72, MRM4r, "vpsrad", X86vsra, X86vsrai,
                             VR256, v8i32, v4i32, bc_v4i32,
                             SSE_INTSHIFT_ITINS_P, 0>, VEX_4V, VEX_L;

let ExeDomain = SSEPackedInt, SchedRW = [WriteVecShift] in {
  // 256-bit logical shifts.
  def VPSLLDQYri : PDIi8<0x73, MRM7r,
                    (outs VR256:$dst), (ins VR256:$src1, i32i8imm:$src2),
                    "vpslldq\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                    [(set VR256:$dst,
                      (int_x86_avx2_psll_dq_bs VR256:$src1, imm:$src2))]>,
                    VEX_4V, VEX_L;
  def VPSRLDQYri : PDIi8<0x73, MRM3r,
                    (outs VR256:$dst), (ins VR256:$src1, i32i8imm:$src2),
                    "vpsrldq\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                    [(set VR256:$dst,
                      (int_x86_avx2_psrl_dq_bs VR256:$src1, imm:$src2))]>,
                    VEX_4V, VEX_L;
  // PSRADQYri doesn't exist in SSE[1-3].
}
} // Predicates = [HasAVX2]
3944
// Legacy SSE2 packed shifts: two-operand encodings, so src1 is tied to dst.
let Constraints = "$src1 = $dst" in {
defm PSLLW : PDI_binop_rmi<0xF1, 0x71, MRM6r, "psllw", X86vshl, X86vshli,
                           VR128, v8i16, v8i16, bc_v8i16,
                           SSE_INTSHIFT_ITINS_P>;
defm PSLLD : PDI_binop_rmi<0xF2, 0x72, MRM6r, "pslld", X86vshl, X86vshli,
                           VR128, v4i32, v4i32, bc_v4i32,
                           SSE_INTSHIFT_ITINS_P>;
defm PSLLQ : PDI_binop_rmi<0xF3, 0x73, MRM6r, "psllq", X86vshl, X86vshli,
                           VR128, v2i64, v2i64, bc_v2i64,
                           SSE_INTSHIFT_ITINS_P>;

defm PSRLW : PDI_binop_rmi<0xD1, 0x71, MRM2r, "psrlw", X86vsrl, X86vsrli,
                           VR128, v8i16, v8i16, bc_v8i16,
                           SSE_INTSHIFT_ITINS_P>;
defm PSRLD : PDI_binop_rmi<0xD2, 0x72, MRM2r, "psrld", X86vsrl, X86vsrli,
                           VR128, v4i32, v4i32, bc_v4i32,
                           SSE_INTSHIFT_ITINS_P>;
defm PSRLQ : PDI_binop_rmi<0xD3, 0x73, MRM2r, "psrlq", X86vsrl, X86vsrli,
                           VR128, v2i64, v2i64, bc_v2i64,
                           SSE_INTSHIFT_ITINS_P>;

defm PSRAW : PDI_binop_rmi<0xE1, 0x71, MRM4r, "psraw", X86vsra, X86vsrai,
                           VR128, v8i16, v8i16, bc_v8i16,
                           SSE_INTSHIFT_ITINS_P>;
defm PSRAD : PDI_binop_rmi<0xE2, 0x72, MRM4r, "psrad", X86vsra, X86vsrai,
                           VR128, v4i32, v4i32, bc_v4i32,
                           SSE_INTSHIFT_ITINS_P>;

let ExeDomain = SSEPackedInt, SchedRW = [WriteVecShift] in {
  // 128-bit logical shifts.
  def PSLLDQri : PDIi8<0x73, MRM7r,
                       (outs VR128:$dst), (ins VR128:$src1, i32i8imm:$src2),
                       "pslldq\t{$src2, $dst|$dst, $src2}",
                       [(set VR128:$dst,
                         (int_x86_sse2_psll_dq_bs VR128:$src1, imm:$src2))],
                         IIC_SSE_INTSHDQ_P_RI>;
  def PSRLDQri : PDIi8<0x73, MRM3r,
                       (outs VR128:$dst), (ins VR128:$src1, i32i8imm:$src2),
                       "psrldq\t{$src2, $dst|$dst, $src2}",
                       [(set VR128:$dst,
                         (int_x86_sse2_psrl_dq_bs VR128:$src1, imm:$src2))],
                         IIC_SSE_INTSHDQ_P_RI>;
  // PSRADQri doesn't exist in SSE[1-3].
}
} // Constraints = "$src1 = $dst"
3990
// Map the psll_dq/psrl_dq intrinsics (bit-count interface) and the
// X86vshldq/X86vshrdq byte-shift nodes onto the *PSLLDQri/*PSRLDQri
// byte-shift instructions; BYTE_imm converts the bit count to bytes.
let Predicates = [HasAVX] in {
  def : Pat<(int_x86_sse2_psll_dq VR128:$src1, imm:$src2),
            (VPSLLDQri VR128:$src1, (BYTE_imm imm:$src2))>;
  def : Pat<(int_x86_sse2_psrl_dq VR128:$src1, imm:$src2),
            (VPSRLDQri VR128:$src1, (BYTE_imm imm:$src2))>;
  def : Pat<(v2f64 (X86fsrl VR128:$src1, i32immSExt8:$src2)),
            (VPSRLDQri VR128:$src1, (BYTE_imm imm:$src2))>;

  // Shift up / down and insert zero's.
  def : Pat<(v2i64 (X86vshldq VR128:$src, (i8 imm:$amt))),
            (VPSLLDQri VR128:$src, (BYTE_imm imm:$amt))>;
  def : Pat<(v2i64 (X86vshrdq VR128:$src, (i8 imm:$amt))),
            (VPSRLDQri VR128:$src, (BYTE_imm imm:$amt))>;
}

let Predicates = [HasAVX2] in {
  def : Pat<(int_x86_avx2_psll_dq VR256:$src1, imm:$src2),
            (VPSLLDQYri VR256:$src1, (BYTE_imm imm:$src2))>;
  def : Pat<(int_x86_avx2_psrl_dq VR256:$src1, imm:$src2),
            (VPSRLDQYri VR256:$src1, (BYTE_imm imm:$src2))>;
}

let Predicates = [UseSSE2] in {
  def : Pat<(int_x86_sse2_psll_dq VR128:$src1, imm:$src2),
            (PSLLDQri VR128:$src1, (BYTE_imm imm:$src2))>;
  def : Pat<(int_x86_sse2_psrl_dq VR128:$src1, imm:$src2),
            (PSRLDQri VR128:$src1, (BYTE_imm imm:$src2))>;
  def : Pat<(v2f64 (X86fsrl VR128:$src1, i32immSExt8:$src2)),
            (PSRLDQri VR128:$src1, (BYTE_imm imm:$src2))>;

  // Shift up / down and insert zero's.
  def : Pat<(v2i64 (X86vshldq VR128:$src, (i8 imm:$amt))),
            (PSLLDQri VR128:$src, (BYTE_imm imm:$amt))>;
  def : Pat<(v2i64 (X86vshrdq VR128:$src, (i8 imm:$amt))),
            (PSRLDQri VR128:$src, (BYTE_imm imm:$amt))>;
}
4027
4028//===---------------------------------------------------------------------===//
4029// SSE2 - Packed Integer Comparison Instructions
4030//===---------------------------------------------------------------------===//
4031
// Packed integer compares.  Equality compares are commutable;
// greater-than compares are not.
defm PCMPEQB : PDI_binop_all<0x74, "pcmpeqb", X86pcmpeq, v16i8, v32i8,
                             SSE_INTALU_ITINS_P, 1>;
defm PCMPEQW : PDI_binop_all<0x75, "pcmpeqw", X86pcmpeq, v8i16, v16i16,
                             SSE_INTALU_ITINS_P, 1>;
defm PCMPEQD : PDI_binop_all<0x76, "pcmpeqd", X86pcmpeq, v4i32, v8i32,
                             SSE_INTALU_ITINS_P, 1>;
defm PCMPGTB : PDI_binop_all<0x64, "pcmpgtb", X86pcmpgt, v16i8, v32i8,
                             SSE_INTALU_ITINS_P, 0>;
defm PCMPGTW : PDI_binop_all<0x65, "pcmpgtw", X86pcmpgt, v8i16, v16i16,
                             SSE_INTALU_ITINS_P, 0>;
defm PCMPGTD : PDI_binop_all<0x66, "pcmpgtd", X86pcmpgt, v4i32, v8i32,
                             SSE_INTALU_ITINS_P, 0>;
4044
4045//===---------------------------------------------------------------------===//
4046// SSE2 - Packed Integer Pack Instructions
4047//===---------------------------------------------------------------------===//
4048
// Saturating pack instructions; operand order matters, so none are marked
// commutable.
defm PACKSSWB : PDI_binop_all_int<0x63, "packsswb", int_x86_sse2_packsswb_128,
                                  int_x86_avx2_packsswb, SSE_INTALU_ITINS_P, 0>;
defm PACKSSDW : PDI_binop_all_int<0x6B, "packssdw", int_x86_sse2_packssdw_128,
                                  int_x86_avx2_packssdw, SSE_INTALU_ITINS_P, 0>;
defm PACKUSWB : PDI_binop_all_int<0x67, "packuswb", int_x86_sse2_packuswb_128,
                                  int_x86_avx2_packuswb, SSE_INTALU_ITINS_P, 0>;
4055
4056//===---------------------------------------------------------------------===//
4057// SSE2 - Packed Integer Shuffle Instructions
4058//===---------------------------------------------------------------------===//
4059
4060let ExeDomain = SSEPackedInt in {
// sse2_pshuffle - pshufd/pshufhw/pshuflw.  Instantiates VEX 128-bit (ri/mi),
// AVX2 256-bit (Yri/Ymi), and legacy SSE 128-bit (ri/mi) forms, each taking
// a register or memory source plus an 8-bit shuffle immediate.  The VEX/AVX2
// memory forms fold through loadv*i64 (unaligned OK), the SSE form through
// memopv2i64 (alignment required).
multiclass sse2_pshuffle<string OpcodeStr, ValueType vt128, ValueType vt256,
                         SDNode OpNode> {
let Predicates = [HasAVX] in {
  def V#NAME#ri : Ii8<0x70, MRMSrcReg, (outs VR128:$dst),
                      (ins VR128:$src1, i8imm:$src2),
                      !strconcat("v", OpcodeStr,
                                 "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                      [(set VR128:$dst,
                        (vt128 (OpNode VR128:$src1, (i8 imm:$src2))))],
                      IIC_SSE_PSHUF_RI>, VEX, Sched<[WriteShuffle]>;
  def V#NAME#mi : Ii8<0x70, MRMSrcMem, (outs VR128:$dst),
                      (ins i128mem:$src1, i8imm:$src2),
                      !strconcat("v", OpcodeStr,
                                 "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                     [(set VR128:$dst,
                       (vt128 (OpNode (bitconvert (loadv2i64 addr:$src1)),
                        (i8 imm:$src2))))], IIC_SSE_PSHUF_MI>, VEX,
                  Sched<[WriteShuffleLd]>;
}

let Predicates = [HasAVX2] in {
  def V#NAME#Yri : Ii8<0x70, MRMSrcReg, (outs VR256:$dst),
                       (ins VR256:$src1, i8imm:$src2),
                       !strconcat("v", OpcodeStr,
                                  "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                       [(set VR256:$dst,
                         (vt256 (OpNode VR256:$src1, (i8 imm:$src2))))],
                       IIC_SSE_PSHUF_RI>, VEX, VEX_L, Sched<[WriteShuffle]>;
  def V#NAME#Ymi : Ii8<0x70, MRMSrcMem, (outs VR256:$dst),
                       (ins i256mem:$src1, i8imm:$src2),
                       !strconcat("v", OpcodeStr,
                                  "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                      [(set VR256:$dst,
                        (vt256 (OpNode (bitconvert (loadv4i64 addr:$src1)),
                         (i8 imm:$src2))))], IIC_SSE_PSHUF_MI>, VEX, VEX_L,
                   Sched<[WriteShuffleLd]>;
}

let Predicates = [UseSSE2] in {
  def ri : Ii8<0x70, MRMSrcReg,
               (outs VR128:$dst), (ins VR128:$src1, i8imm:$src2),
               !strconcat(OpcodeStr,
                          "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                [(set VR128:$dst,
                  (vt128 (OpNode VR128:$src1, (i8 imm:$src2))))],
                IIC_SSE_PSHUF_RI>, Sched<[WriteShuffle]>;
  def mi : Ii8<0x70, MRMSrcMem,
               (outs VR128:$dst), (ins i128mem:$src1, i8imm:$src2),
               !strconcat(OpcodeStr,
                          "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                [(set VR128:$dst,
                  (vt128 (OpNode (bitconvert (memopv2i64 addr:$src1)),
                          (i8 imm:$src2))))], IIC_SSE_PSHUF_MI>,
           Sched<[WriteShuffleLd]>;
}
}
4117} // ExeDomain = SSEPackedInt
4118
// Instantiate the shuffles: pshufd (dwords, 66 prefix), pshufhw (high
// words, F3), pshuflw (low words, F2) — all share opcode 0x70.
defm PSHUFD  : sse2_pshuffle<"pshufd", v4i32, v8i32, X86PShufd>, TB, OpSize;
defm PSHUFHW : sse2_pshuffle<"pshufhw", v8i16, v16i16, X86PShufhw>, XS;
defm PSHUFLW : sse2_pshuffle<"pshuflw", v8i16, v16i16, X86PShuflw>, XD;

// Also use pshufd for v4f32 shuffles-by-immediate (same lane behavior,
// integer domain).
let Predicates = [HasAVX] in {
  def : Pat<(v4f32 (X86PShufd (loadv4f32 addr:$src1), (i8 imm:$imm))),
            (VPSHUFDmi addr:$src1, imm:$imm)>;
  def : Pat<(v4f32 (X86PShufd VR128:$src1, (i8 imm:$imm))),
            (VPSHUFDri VR128:$src1, imm:$imm)>;
}

let Predicates = [UseSSE2] in {
  def : Pat<(v4f32 (X86PShufd (memopv4f32 addr:$src1), (i8 imm:$imm))),
            (PSHUFDmi addr:$src1, imm:$imm)>;
  def : Pat<(v4f32 (X86PShufd VR128:$src1, (i8 imm:$imm))),
            (PSHUFDri VR128:$src1, imm:$imm)>;
}
4136
4137//===---------------------------------------------------------------------===//
4138// SSE2 - Packed Integer Unpack Instructions
4139//===---------------------------------------------------------------------===//
4140
4141let ExeDomain = SSEPackedInt in {
// sse2_unpack - 128-bit punpck[lh]* interleave.  rr and rm forms; Is2Addr
// selects the two-operand SSE asm string vs. the three-operand VEX one.
// The rm form folds the memory operand through bc_frag of memopv2i64.
multiclass sse2_unpack<bits<8> opc, string OpcodeStr, ValueType vt,
                       SDNode OpNode, PatFrag bc_frag, bit Is2Addr = 1> {
  def rr : PDI<opc, MRMSrcReg,
      (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
      !if(Is2Addr,
          !strconcat(OpcodeStr,"\t{$src2, $dst|$dst, $src2}"),
          !strconcat(OpcodeStr,"\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
      [(set VR128:$dst, (vt (OpNode VR128:$src1, VR128:$src2)))],
      IIC_SSE_UNPCK>, Sched<[WriteShuffle]>;
  def rm : PDI<opc, MRMSrcMem,
      (outs VR128:$dst), (ins VR128:$src1, i128mem:$src2),
      !if(Is2Addr,
          !strconcat(OpcodeStr,"\t{$src2, $dst|$dst, $src2}"),
          !strconcat(OpcodeStr,"\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
      [(set VR128:$dst, (OpNode VR128:$src1,
                                  (bc_frag (memopv2i64
                                               addr:$src2))))],
                                               IIC_SSE_UNPCK>,
      Sched<[WriteShuffleLd, ReadAfterLd]>;
}
4162
// sse2_unpack_y - AVX2 256-bit punpck[lh]* interleave (always the
// three-operand VEX asm string).
// NOTE(review): Yrm folds through memopv4i64 (aligned-only) where other
// AVX2 forms in this file use loadv4i64 — confirm whether unaligned
// folding was intended here.
multiclass sse2_unpack_y<bits<8> opc, string OpcodeStr, ValueType vt,
                         SDNode OpNode, PatFrag bc_frag> {
  def Yrr : PDI<opc, MRMSrcReg,
      (outs VR256:$dst), (ins VR256:$src1, VR256:$src2),
      !strconcat(OpcodeStr,"\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
      [(set VR256:$dst, (vt (OpNode VR256:$src1, VR256:$src2)))]>,
      Sched<[WriteShuffle]>;
  def Yrm : PDI<opc, MRMSrcMem,
      (outs VR256:$dst), (ins VR256:$src1, i256mem:$src2),
      !strconcat(OpcodeStr,"\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
      [(set VR256:$dst, (OpNode VR256:$src1,
                                  (bc_frag (memopv4i64 addr:$src2))))]>,
      Sched<[WriteShuffleLd, ReadAfterLd]>;
}
4177
// VEX-encoded 128-bit unpack instantiations (three-operand, Is2Addr = 0).
let Predicates = [HasAVX] in {
  defm VPUNPCKLBW  : sse2_unpack<0x60, "vpunpcklbw", v16i8, X86Unpckl,
                                 bc_v16i8, 0>, VEX_4V;
  defm VPUNPCKLWD  : sse2_unpack<0x61, "vpunpcklwd", v8i16, X86Unpckl,
                                 bc_v8i16, 0>, VEX_4V;
  defm VPUNPCKLDQ  : sse2_unpack<0x62, "vpunpckldq", v4i32, X86Unpckl,
                                 bc_v4i32, 0>, VEX_4V;
  defm VPUNPCKLQDQ : sse2_unpack<0x6C, "vpunpcklqdq", v2i64, X86Unpckl,
                                 bc_v2i64, 0>, VEX_4V;

  defm VPUNPCKHBW  : sse2_unpack<0x68, "vpunpckhbw", v16i8, X86Unpckh,
                                 bc_v16i8, 0>, VEX_4V;
  defm VPUNPCKHWD  : sse2_unpack<0x69, "vpunpckhwd", v8i16, X86Unpckh,
                                 bc_v8i16, 0>, VEX_4V;
  defm VPUNPCKHDQ  : sse2_unpack<0x6A, "vpunpckhdq", v4i32, X86Unpckh,
                                 bc_v4i32, 0>, VEX_4V;
  defm VPUNPCKHQDQ : sse2_unpack<0x6D, "vpunpckhqdq", v2i64, X86Unpckh,
                                 bc_v2i64, 0>, VEX_4V;
}
4197
// AVX2 256-bit unpack instantiations.
let Predicates = [HasAVX2] in {
  defm VPUNPCKLBW  : sse2_unpack_y<0x60, "vpunpcklbw", v32i8, X86Unpckl,
                                   bc_v32i8>, VEX_4V, VEX_L;
  defm VPUNPCKLWD  : sse2_unpack_y<0x61, "vpunpcklwd", v16i16, X86Unpckl,
                                   bc_v16i16>, VEX_4V, VEX_L;
  defm VPUNPCKLDQ  : sse2_unpack_y<0x62, "vpunpckldq", v8i32, X86Unpckl,
                                   bc_v8i32>, VEX_4V, VEX_L;
  defm VPUNPCKLQDQ : sse2_unpack_y<0x6C, "vpunpcklqdq", v4i64, X86Unpckl,
                                   bc_v4i64>, VEX_4V, VEX_L;

  defm VPUNPCKHBW  : sse2_unpack_y<0x68, "vpunpckhbw", v32i8, X86Unpckh,
                                   bc_v32i8>, VEX_4V, VEX_L;
  defm VPUNPCKHWD  : sse2_unpack_y<0x69, "vpunpckhwd", v16i16, X86Unpckh,
                                   bc_v16i16>, VEX_4V, VEX_L;
  defm VPUNPCKHDQ  : sse2_unpack_y<0x6A, "vpunpckhdq", v8i32, X86Unpckh,
                                   bc_v8i32>, VEX_4V, VEX_L;
  defm VPUNPCKHQDQ : sse2_unpack_y<0x6D, "vpunpckhqdq", v4i64, X86Unpckh,
                                   bc_v4i64>, VEX_4V, VEX_L;
}
4217
// SSE2 (non-VEX) 128-bit unpack forms: destructive two-operand encoding, so
// the first source register is tied to the destination.
let Constraints = "$src1 = $dst" in {
  defm PUNPCKLBW  : sse2_unpack<0x60, "punpcklbw", v16i8, X86Unpckl,
                                bc_v16i8>;
  defm PUNPCKLWD  : sse2_unpack<0x61, "punpcklwd", v8i16, X86Unpckl,
                                bc_v8i16>;
  defm PUNPCKLDQ  : sse2_unpack<0x62, "punpckldq", v4i32, X86Unpckl,
                                bc_v4i32>;
  defm PUNPCKLQDQ : sse2_unpack<0x6C, "punpcklqdq", v2i64, X86Unpckl,
                                bc_v2i64>;

  defm PUNPCKHBW  : sse2_unpack<0x68, "punpckhbw", v16i8, X86Unpckh,
                                bc_v16i8>;
  defm PUNPCKHWD  : sse2_unpack<0x69, "punpckhwd", v8i16, X86Unpckh,
                                bc_v8i16>;
  defm PUNPCKHDQ  : sse2_unpack<0x6A, "punpckhdq", v4i32, X86Unpckh,
                                bc_v4i32>;
  defm PUNPCKHQDQ : sse2_unpack<0x6D, "punpckhqdq", v2i64, X86Unpckh,
                                bc_v2i64>;
}
4237} // ExeDomain = SSEPackedInt
4238
4239//===---------------------------------------------------------------------===//
4240// SSE2 - Packed Integer Extract and Insert
4241//===---------------------------------------------------------------------===//
4242
// SSE2 packed-integer word extract (PEXTRW) and insert (PINSRW).
let ExeDomain = SSEPackedInt in {
// PINSRW: insert a 16-bit value (from a GPR or from memory) into the word
// element of $src1 selected by imm $src3. Is2Addr selects the destructive
// SSE asm string vs. the three-operand VEX one.
multiclass sse2_pinsrw<bit Is2Addr = 1> {
  def rri : Ii8<0xC4, MRMSrcReg,
       (outs VR128:$dst), (ins VR128:$src1,
        GR32orGR64:$src2, i32i8imm:$src3),
       !if(Is2Addr,
           "pinsrw\t{$src3, $src2, $dst|$dst, $src2, $src3}",
           "vpinsrw\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
       [(set VR128:$dst,
         (X86pinsrw VR128:$src1, GR32orGR64:$src2, imm:$src3))],
       IIC_SSE_PINSRW>, Sched<[WriteShuffle]>;
  def rmi : Ii8<0xC4, MRMSrcMem,
                       (outs VR128:$dst), (ins VR128:$src1,
                        i16mem:$src2, i32i8imm:$src3),
       !if(Is2Addr,
           "pinsrw\t{$src3, $src2, $dst|$dst, $src2, $src3}",
           "vpinsrw\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
       [(set VR128:$dst,
         (X86pinsrw VR128:$src1, (extloadi16 addr:$src2),
                    imm:$src3))], IIC_SSE_PINSRW>,
       Sched<[WriteShuffleLd, ReadAfterLd]>;
}

// Extract: PEXTRW pulls the word selected by imm $src2 out of an XMM
// register into a GPR.
let Predicates = [HasAVX] in
def VPEXTRWri : Ii8<0xC5, MRMSrcReg,
                    (outs GR32orGR64:$dst), (ins VR128:$src1, i32i8imm:$src2),
                    "vpextrw\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                    [(set GR32orGR64:$dst, (X86pextrw (v8i16 VR128:$src1),
                                            imm:$src2))]>, TB, OpSize, VEX,
                Sched<[WriteShuffle]>;
// Register-only form (MRMSrcReg): schedule as a plain shuffle, not as a
// folded-load shuffle — the previous WriteShuffleLd/ReadAfterLd scheduling
// was copy-pasted from a memory form.
def PEXTRWri : PDIi8<0xC5, MRMSrcReg,
                    (outs GR32orGR64:$dst), (ins VR128:$src1, i32i8imm:$src2),
                    "pextrw\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                    [(set GR32orGR64:$dst, (X86pextrw (v8i16 VR128:$src1),
                                            imm:$src2))], IIC_SSE_PEXTRW>,
               Sched<[WriteShuffle]>;

// Insert
let Predicates = [HasAVX] in
defm VPINSRW : sse2_pinsrw<0>, TB, OpSize, VEX_4V;

let Predicates = [UseSSE2], Constraints = "$src1 = $dst" in
defm PINSRW : sse2_pinsrw, TB, OpSize;

} // ExeDomain = SSEPackedInt
4289
4290//===---------------------------------------------------------------------===//
4291// SSE2 - Packed Mask Creation
4292//===---------------------------------------------------------------------===//
4293
// PMOVMSKB: collect the sign bit of each byte of the vector source into a
// GPR bitmask (matched via the SSE2/AVX2 intrinsics).
let ExeDomain = SSEPackedInt, SchedRW = [WriteVecLogic] in {

def VPMOVMSKBrr  : VPDI<0xD7, MRMSrcReg, (outs GR32orGR64:$dst),
           (ins VR128:$src),
           "pmovmskb\t{$src, $dst|$dst, $src}",
           [(set GR32orGR64:$dst, (int_x86_sse2_pmovmskb_128 VR128:$src))],
           IIC_SSE_MOVMSK>, VEX;

// AVX2 256-bit form: 32-bit mask from a VR256 source.
let Predicates = [HasAVX2] in {
def VPMOVMSKBYrr  : VPDI<0xD7, MRMSrcReg, (outs GR32orGR64:$dst),
           (ins VR256:$src),
           "pmovmskb\t{$src, $dst|$dst, $src}",
           [(set GR32orGR64:$dst, (int_x86_avx2_pmovmskb VR256:$src))]>,
           VEX, VEX_L;
}

def PMOVMSKBrr : PDI<0xD7, MRMSrcReg, (outs GR32orGR64:$dst), (ins VR128:$src),
           "pmovmskb\t{$src, $dst|$dst, $src}",
           [(set GR32orGR64:$dst, (int_x86_sse2_pmovmskb_128 VR128:$src))],
           IIC_SSE_MOVMSK>;

} // ExeDomain = SSEPackedInt
4316
4317//===---------------------------------------------------------------------===//
4318// SSE2 - Conditional Store
4319//===---------------------------------------------------------------------===//
4320
// MASKMOVDQU: byte-masked store of $src through the implicit pointer in
// EDI (32-bit mode) or RDI (64-bit mode); the implicit register is modeled
// with Uses and passed to the intrinsic explicitly.
let ExeDomain = SSEPackedInt, SchedRW = [WriteStore] in {

let Uses = [EDI], Predicates = [HasAVX,In32BitMode] in
def VMASKMOVDQU : VPDI<0xF7, MRMSrcReg, (outs),
           (ins VR128:$src, VR128:$mask),
           "maskmovdqu\t{$mask, $src|$src, $mask}",
           [(int_x86_sse2_maskmov_dqu VR128:$src, VR128:$mask, EDI)],
           IIC_SSE_MASKMOV>, VEX;
let Uses = [RDI], Predicates = [HasAVX,In64BitMode] in
def VMASKMOVDQU64 : VPDI<0xF7, MRMSrcReg, (outs),
           (ins VR128:$src, VR128:$mask),
           "maskmovdqu\t{$mask, $src|$src, $mask}",
           [(int_x86_sse2_maskmov_dqu VR128:$src, VR128:$mask, RDI)],
           IIC_SSE_MASKMOV>, VEX;

let Uses = [EDI], Predicates = [UseSSE2,In32BitMode] in
def MASKMOVDQU : PDI<0xF7, MRMSrcReg, (outs), (ins VR128:$src, VR128:$mask),
           "maskmovdqu\t{$mask, $src|$src, $mask}",
           [(int_x86_sse2_maskmov_dqu VR128:$src, VR128:$mask, EDI)],
           IIC_SSE_MASKMOV>;
let Uses = [RDI], Predicates = [UseSSE2,In64BitMode] in
def MASKMOVDQU64 : PDI<0xF7, MRMSrcReg, (outs), (ins VR128:$src, VR128:$mask),
           "maskmovdqu\t{$mask, $src|$src, $mask}",
           [(int_x86_sse2_maskmov_dqu VR128:$src, VR128:$mask, RDI)],
           IIC_SSE_MASKMOV>;

} // ExeDomain = SSEPackedInt
4348
4349//===---------------------------------------------------------------------===//
4350// SSE2 - Move Doubleword
4351//===---------------------------------------------------------------------===//
4352
4353//===---------------------------------------------------------------------===//
4354// Move Int Doubleword to Packed Double Int
4355//
// MOVD/MOVQ GPR -> XMM: place a 32- or 64-bit integer into the low element
// of an XMM register (scalar_to_vector). VEX-encoded forms first, then the
// legacy SSE2 forms; the SD variants (isCodeGenOnly) model GR64 -> FR64
// bitcasts with the same encoding.
def VMOVDI2PDIrr : VS2I<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR32:$src),
                      "movd\t{$src, $dst|$dst, $src}",
                      [(set VR128:$dst,
                        (v4i32 (scalar_to_vector GR32:$src)))], IIC_SSE_MOVDQ>,
                        VEX, Sched<[WriteMove]>;
def VMOVDI2PDIrm : VS2I<0x6E, MRMSrcMem, (outs VR128:$dst), (ins i32mem:$src),
                      "movd\t{$src, $dst|$dst, $src}",
                      [(set VR128:$dst,
                        (v4i32 (scalar_to_vector (loadi32 addr:$src))))],
                        IIC_SSE_MOVDQ>,
                      VEX, Sched<[WriteLoad]>;
def VMOV64toPQIrr : VRS2I<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR64:$src),
                        "movq\t{$src, $dst|$dst, $src}",
                        [(set VR128:$dst,
                          (v2i64 (scalar_to_vector GR64:$src)))],
                          IIC_SSE_MOVDQ>, VEX, Sched<[WriteMove]>;
let isCodeGenOnly = 1 in
def VMOV64toSDrr : VRS2I<0x6E, MRMSrcReg, (outs FR64:$dst), (ins GR64:$src),
                       "movq\t{$src, $dst|$dst, $src}",
                       [(set FR64:$dst, (bitconvert GR64:$src))],
                       IIC_SSE_MOVDQ>, VEX, Sched<[WriteMove]>;

def MOVDI2PDIrr : S2I<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR32:$src),
                      "movd\t{$src, $dst|$dst, $src}",
                      [(set VR128:$dst,
                        (v4i32 (scalar_to_vector GR32:$src)))], IIC_SSE_MOVDQ>,
                  Sched<[WriteMove]>;
def MOVDI2PDIrm : S2I<0x6E, MRMSrcMem, (outs VR128:$dst), (ins i32mem:$src),
                      "movd\t{$src, $dst|$dst, $src}",
                      [(set VR128:$dst,
                        (v4i32 (scalar_to_vector (loadi32 addr:$src))))],
                        IIC_SSE_MOVDQ>, Sched<[WriteLoad]>;
def MOV64toPQIrr : RS2I<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR64:$src),
                        "mov{d|q}\t{$src, $dst|$dst, $src}",
                        [(set VR128:$dst,
                          (v2i64 (scalar_to_vector GR64:$src)))],
                          IIC_SSE_MOVDQ>, Sched<[WriteMove]>;
let isCodeGenOnly = 1 in
def MOV64toSDrr : RS2I<0x6E, MRMSrcReg, (outs FR64:$dst), (ins GR64:$src),
                       "mov{d|q}\t{$src, $dst|$dst, $src}",
                       [(set FR64:$dst, (bitconvert GR64:$src))],
                       IIC_SSE_MOVDQ>, Sched<[WriteMove]>;
4398
4399//===---------------------------------------------------------------------===//
4400// Move Int Doubleword to Single Scalar
4401//
// GR32 -> FR32 bitcasts using the MOVD encoding (reg and load forms).
// isCodeGenOnly: these alias the MOVD*2PDI encodings above and exist only
// so ISel can match the bitconvert patterns.
let isCodeGenOnly = 1 in {
  def VMOVDI2SSrr  : VS2I<0x6E, MRMSrcReg, (outs FR32:$dst), (ins GR32:$src),
                        "movd\t{$src, $dst|$dst, $src}",
                        [(set FR32:$dst, (bitconvert GR32:$src))],
                        IIC_SSE_MOVDQ>, VEX, Sched<[WriteMove]>;

  def VMOVDI2SSrm  : VS2I<0x6E, MRMSrcMem, (outs FR32:$dst), (ins i32mem:$src),
                        "movd\t{$src, $dst|$dst, $src}",
                        [(set FR32:$dst, (bitconvert (loadi32 addr:$src)))],
                        IIC_SSE_MOVDQ>,
                        VEX, Sched<[WriteLoad]>;
  def MOVDI2SSrr  : S2I<0x6E, MRMSrcReg, (outs FR32:$dst), (ins GR32:$src),
                        "movd\t{$src, $dst|$dst, $src}",
                        [(set FR32:$dst, (bitconvert GR32:$src))],
                        IIC_SSE_MOVDQ>, Sched<[WriteMove]>;

  def MOVDI2SSrm  : S2I<0x6E, MRMSrcMem, (outs FR32:$dst), (ins i32mem:$src),
                        "movd\t{$src, $dst|$dst, $src}",
                        [(set FR32:$dst, (bitconvert (loadi32 addr:$src)))],
                        IIC_SSE_MOVDQ>, Sched<[WriteLoad]>;
}
4423
4424//===---------------------------------------------------------------------===//
4425// Move Packed Doubleword Int to Packed Double Int
4426//
// MOVD XMM -> GR32 / memory: extract element 0 of a v4i32 into a GPR (rr)
// or store it directly to memory (mr).
def VMOVPDI2DIrr  : VS2I<0x7E, MRMDestReg, (outs GR32:$dst), (ins VR128:$src),
                       "movd\t{$src, $dst|$dst, $src}",
                       [(set GR32:$dst, (vector_extract (v4i32 VR128:$src),
                                        (iPTR 0)))], IIC_SSE_MOVD_ToGP>, VEX,
                    Sched<[WriteMove]>;
// Store form (MRMDestMem): this is a store, so schedule it as WriteStore;
// the previous WriteLoad annotation was wrong.
def VMOVPDI2DImr  : VS2I<0x7E, MRMDestMem, (outs),
                       (ins i32mem:$dst, VR128:$src),
                       "movd\t{$src, $dst|$dst, $src}",
                       [(store (i32 (vector_extract (v4i32 VR128:$src),
                                     (iPTR 0))), addr:$dst)], IIC_SSE_MOVDQ>,
                                     VEX, Sched<[WriteStore]>;
def MOVPDI2DIrr  : S2I<0x7E, MRMDestReg, (outs GR32:$dst), (ins VR128:$src),
                       "movd\t{$src, $dst|$dst, $src}",
                       [(set GR32:$dst, (vector_extract (v4i32 VR128:$src),
                                        (iPTR 0)))], IIC_SSE_MOVD_ToGP>,
                   Sched<[WriteMove]>;
// Store form: likewise scheduled as WriteStore (was WriteLoad).
def MOVPDI2DImr  : S2I<0x7E, MRMDestMem, (outs), (ins i32mem:$dst, VR128:$src),
                       "movd\t{$src, $dst|$dst, $src}",
                       [(store (i32 (vector_extract (v4i32 VR128:$src),
                                     (iPTR 0))), addr:$dst)],
                                     IIC_SSE_MOVDQ>, Sched<[WriteStore]>;
4448
// Match inserting a GPR into element 0 of a zero or undef 256-bit vector:
// the 128-bit movd/movq already define the full destination register, so
// wrap them with SUBREG_TO_REG instead of emitting a 256-bit insert.
def : Pat<(v8i32 (X86Vinsert (v8i32 immAllZerosV), GR32:$src2, (iPTR 0))),
        (SUBREG_TO_REG (i32 0), (VMOVDI2PDIrr GR32:$src2), sub_xmm)>;

def : Pat<(v4i64 (X86Vinsert (bc_v4i64 (v8i32 immAllZerosV)), GR64:$src2, (iPTR 0))),
        (SUBREG_TO_REG (i32 0), (VMOV64toPQIrr GR64:$src2), sub_xmm)>;

def : Pat<(v8i32 (X86Vinsert undef, GR32:$src2, (iPTR 0))),
        (SUBREG_TO_REG (i32 0), (VMOVDI2PDIrr GR32:$src2), sub_xmm)>;

def : Pat<(v4i64 (X86Vinsert undef, GR64:$src2, (iPTR 0))),
        (SUBREG_TO_REG (i32 0), (VMOV64toPQIrr GR64:$src2), sub_xmm)>;
4460
4461//===---------------------------------------------------------------------===//
4462// Move Packed Doubleword Int first element to Doubleword Int
4463//
// MOVQ XMM -> GR64: extract element 0 of a v2i64 into a 64-bit GPR.
let SchedRW = [WriteMove] in {
def VMOVPQIto64rr : VRS2I<0x7E, MRMDestReg, (outs GR64:$dst), (ins VR128:$src),
                          "movq\t{$src, $dst|$dst, $src}",
                          [(set GR64:$dst, (vector_extract (v2i64 VR128:$src),
                                                           (iPTR 0)))],
                                                           IIC_SSE_MOVD_ToGP>,
                      VEX;

def MOVPQIto64rr : RS2I<0x7E, MRMDestReg, (outs GR64:$dst), (ins VR128:$src),
                        "mov{d|q}\t{$src, $dst|$dst, $src}",
                        [(set GR64:$dst, (vector_extract (v2i64 VR128:$src),
                                                         (iPTR 0)))],
                                                         IIC_SSE_MOVD_ToGP>;
} //SchedRW
4478
4479//===---------------------------------------------------------------------===//
4480// Bitcast FR64 <-> GR64
4481//
// FR64 <-> GR64 bitcasts (and the i64 load/store forms) via the MOVQ
// encoding; isCodeGenOnly since they share encodings with MOV*PQI* above.
let isCodeGenOnly = 1 in {
  let Predicates = [UseAVX] in
  def VMOV64toSDrm : VS2SI<0x7E, MRMSrcMem, (outs FR64:$dst), (ins i64mem:$src),
                          "movq\t{$src, $dst|$dst, $src}",
                          [(set FR64:$dst, (bitconvert (loadi64 addr:$src)))]>,
                          VEX, Sched<[WriteLoad]>;
  def VMOVSDto64rr : VRS2I<0x7E, MRMDestReg, (outs GR64:$dst), (ins FR64:$src),
                           "movq\t{$src, $dst|$dst, $src}",
                           [(set GR64:$dst, (bitconvert FR64:$src))],
                           IIC_SSE_MOVDQ>, VEX, Sched<[WriteMove]>;
  def VMOVSDto64mr : VRS2I<0x7E, MRMDestMem, (outs), (ins i64mem:$dst, FR64:$src),
                           "movq\t{$src, $dst|$dst, $src}",
                           [(store (i64 (bitconvert FR64:$src)), addr:$dst)],
                           IIC_SSE_MOVDQ>, VEX, Sched<[WriteStore]>;

  def MOV64toSDrm : S2SI<0x7E, MRMSrcMem, (outs FR64:$dst), (ins i64mem:$src),
                         "movq\t{$src, $dst|$dst, $src}",
                         [(set FR64:$dst, (bitconvert (loadi64 addr:$src)))],
                         IIC_SSE_MOVDQ>, Sched<[WriteLoad]>;
  def MOVSDto64rr : RS2I<0x7E, MRMDestReg, (outs GR64:$dst), (ins FR64:$src),
                         "mov{d|q}\t{$src, $dst|$dst, $src}",
                         [(set GR64:$dst, (bitconvert FR64:$src))],
                         IIC_SSE_MOVD_ToGP>, Sched<[WriteMove]>;
  def MOVSDto64mr : RS2I<0x7E, MRMDestMem, (outs), (ins i64mem:$dst, FR64:$src),
                         "movq\t{$src, $dst|$dst, $src}",
                         [(store (i64 (bitconvert FR64:$src)), addr:$dst)],
                         IIC_SSE_MOVDQ>, Sched<[WriteStore]>;
}
4510
4511//===---------------------------------------------------------------------===//
4512// Move Scalar Single to Double Int
4513//
// FR32 -> GR32 / memory bitcasts via the MOVD encoding; isCodeGenOnly
// companions of MOVPDI2DI* above.
let isCodeGenOnly = 1 in {
  def VMOVSS2DIrr  : VS2I<0x7E, MRMDestReg, (outs GR32:$dst), (ins FR32:$src),
                        "movd\t{$src, $dst|$dst, $src}",
                        [(set GR32:$dst, (bitconvert FR32:$src))],
                        IIC_SSE_MOVD_ToGP>, VEX, Sched<[WriteMove]>;
  def VMOVSS2DImr  : VS2I<0x7E, MRMDestMem, (outs), (ins i32mem:$dst, FR32:$src),
                        "movd\t{$src, $dst|$dst, $src}",
                        [(store (i32 (bitconvert FR32:$src)), addr:$dst)],
                        IIC_SSE_MOVDQ>, VEX, Sched<[WriteStore]>;
  def MOVSS2DIrr  : S2I<0x7E, MRMDestReg, (outs GR32:$dst), (ins FR32:$src),
                        "movd\t{$src, $dst|$dst, $src}",
                        [(set GR32:$dst, (bitconvert FR32:$src))],
                        IIC_SSE_MOVD_ToGP>, Sched<[WriteMove]>;
  def MOVSS2DImr  : S2I<0x7E, MRMDestMem, (outs), (ins i32mem:$dst, FR32:$src),
                        "movd\t{$src, $dst|$dst, $src}",
                        [(store (i32 (bitconvert FR32:$src)), addr:$dst)],
                        IIC_SSE_MOVDQ>, Sched<[WriteStore]>;
}
4532
4533//===---------------------------------------------------------------------===//
4534// Patterns and instructions to describe movd/movq to XMM register zero-extends
4535//
// MOVQ GR64 -> XMM matched as a zero-extending move (X86vzmovl of a
// scalar_to_vector); high lane of the result is zero.
let isCodeGenOnly = 1, SchedRW = [WriteMove] in {
let AddedComplexity = 15 in {
def VMOVZQI2PQIrr : VS2I<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR64:$src),
                       "movq\t{$src, $dst|$dst, $src}", // X86-64 only
                       [(set VR128:$dst, (v2i64 (X86vzmovl
                                      (v2i64 (scalar_to_vector GR64:$src)))))],
                                      IIC_SSE_MOVDQ>,
                                      VEX, VEX_W;
def MOVZQI2PQIrr : RS2I<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR64:$src),
                       "mov{d|q}\t{$src, $dst|$dst, $src}", // X86-64 only
                       [(set VR128:$dst, (v2i64 (X86vzmovl
                                      (v2i64 (scalar_to_vector GR64:$src)))))],
                                      IIC_SSE_MOVDQ>;
}
} // isCodeGenOnly, SchedRW
4551
// Selection patterns mapping zero-extending scalar moves (X86vzmovl of a
// scalar_to_vector / loaded value) onto the MOVD instructions defined above.
let Predicates = [UseAVX] in {
  let AddedComplexity = 15 in
    def : Pat<(v4i32 (X86vzmovl (v4i32 (scalar_to_vector GR32:$src)))),
              (VMOVDI2PDIrr GR32:$src)>;

  // AVX 128-bit movd/movq instruction write zeros in the high 128-bit part.
  let AddedComplexity = 20 in {
    def : Pat<(v4i32 (X86vzmovl (v4i32 (scalar_to_vector (loadi32 addr:$src))))),
              (VMOVDI2PDIrm addr:$src)>;
    def : Pat<(v4i32 (X86vzmovl (bc_v4i32 (loadv4f32 addr:$src)))),
              (VMOVDI2PDIrm addr:$src)>;
    def : Pat<(v4i32 (X86vzmovl (bc_v4i32 (loadv2i64 addr:$src)))),
              (VMOVDI2PDIrm addr:$src)>;
  }
  // Use regular 128-bit instructions to match 256-bit scalar_to_vec+zext.
  def : Pat<(v8i32 (X86vzmovl (insert_subvector undef,
                               (v4i32 (scalar_to_vector GR32:$src)),(iPTR 0)))),
            (SUBREG_TO_REG (i32 0), (VMOVDI2PDIrr GR32:$src), sub_xmm)>;
  def : Pat<(v4i64 (X86vzmovl (insert_subvector undef,
                               (v2i64 (scalar_to_vector GR64:$src)),(iPTR 0)))),
            (SUBREG_TO_REG (i64 0), (VMOVZQI2PQIrr GR64:$src), sub_xmm)>;
}

// SSE2 equivalents of the first group (no 256-bit forms without AVX).
let Predicates = [UseSSE2] in {
  let AddedComplexity = 15 in
    def : Pat<(v4i32 (X86vzmovl (v4i32 (scalar_to_vector GR32:$src)))),
              (MOVDI2PDIrr GR32:$src)>;

  let AddedComplexity = 20 in {
    def : Pat<(v4i32 (X86vzmovl (v4i32 (scalar_to_vector (loadi32 addr:$src))))),
              (MOVDI2PDIrm addr:$src)>;
    def : Pat<(v4i32 (X86vzmovl (bc_v4i32 (loadv4f32 addr:$src)))),
              (MOVDI2PDIrm addr:$src)>;
    def : Pat<(v4i32 (X86vzmovl (bc_v4i32 (loadv2i64 addr:$src)))),
              (MOVDI2PDIrm addr:$src)>;
  }
}
4589
4590// These are the correct encodings of the instructions so that we know how to
4591// read correct assembly, even though we continue to emit the wrong ones for
4592// compatibility with Darwin's buggy assembler.
// Assembler aliases: accept "movq" spellings for the GR64<->XMM moves
// (emitted as mov{d|q}), without making them the printed form.
def : InstAlias<"movq\t{$src, $dst|$dst, $src}",
                (MOV64toPQIrr VR128:$dst, GR64:$src), 0>;
def : InstAlias<"movq\t{$src, $dst|$dst, $src}",
                (MOVPQIto64rr GR64:$dst, VR128:$src), 0>;
// Allow "vmovd" but print "vmovq" since we don't need compatibility for AVX.
def : InstAlias<"vmovd\t{$src, $dst|$dst, $src}",
                (VMOV64toPQIrr VR128:$dst, GR64:$src), 0>;
def : InstAlias<"vmovd\t{$src, $dst|$dst, $src}",
                (VMOVPQIto64rr GR64:$dst, VR128:$src), 0>;
4602
4603//===---------------------------------------------------------------------===//
4604// SSE2 - Move Quadword
4605//===---------------------------------------------------------------------===//
4606
4607//===---------------------------------------------------------------------===//
4608// Move Quadword Int to Packed Quadword Int
4609//
4610
// MOVQ mem -> XMM: load 64 bits into the low quadword of an XMM register
// (scalar_to_vector of a loadi64).
let SchedRW = [WriteLoad] in {
def VMOVQI2PQIrm : I<0x7E, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
                    "vmovq\t{$src, $dst|$dst, $src}",
                    [(set VR128:$dst,
                      (v2i64 (scalar_to_vector (loadi64 addr:$src))))]>, XS,
                    VEX, Requires<[UseAVX]>;
def MOVQI2PQIrm : I<0x7E, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
                    "movq\t{$src, $dst|$dst, $src}",
                    [(set VR128:$dst,
                      (v2i64 (scalar_to_vector (loadi64 addr:$src))))],
                      IIC_SSE_MOVDQ>, XS,
                    Requires<[UseSSE2]>; // SSE2 instruction with XS Prefix
} // SchedRW
4624
4625//===---------------------------------------------------------------------===//
4626// Move Packed Quadword Int to Quadword Int
4627//
// MOVQ XMM -> mem: store element 0 of a v2i64 to memory.
let SchedRW = [WriteStore] in {
def VMOVPQI2QImr : VS2I<0xD6, MRMDestMem, (outs), (ins i64mem:$dst, VR128:$src),
                      "movq\t{$src, $dst|$dst, $src}",
                      [(store (i64 (vector_extract (v2i64 VR128:$src),
                                    (iPTR 0))), addr:$dst)],
                                    IIC_SSE_MOVDQ>, VEX;
def MOVPQI2QImr : S2I<0xD6, MRMDestMem, (outs), (ins i64mem:$dst, VR128:$src),
                      "movq\t{$src, $dst|$dst, $src}",
                      [(store (i64 (vector_extract (v2i64 VR128:$src),
                                    (iPTR 0))), addr:$dst)],
                                    IIC_SSE_MOVDQ>;
} // SchedRW
4640
4641//===---------------------------------------------------------------------===//
4642// Store / copy lower 64-bits of a XMM register.
4643//
// Store the low 64 bits of an XMM register, matched via the storel_dq
// intrinsic (same 0xD6 encoding as MOVPQI2QImr above).
def VMOVLQ128mr : VS2I<0xD6, MRMDestMem, (outs), (ins i64mem:$dst, VR128:$src),
                     "movq\t{$src, $dst|$dst, $src}",
                     [(int_x86_sse2_storel_dq addr:$dst, VR128:$src)]>, VEX,
                  Sched<[WriteStore]>;
def MOVLQ128mr : S2I<0xD6, MRMDestMem, (outs), (ins i64mem:$dst, VR128:$src),
                     "movq\t{$src, $dst|$dst, $src}",
                     [(int_x86_sse2_storel_dq addr:$dst, VR128:$src)],
                     IIC_SSE_MOVDQ>, Sched<[WriteStore]>;
4652
// MOVQ load matched as a zero-extending move: loads 64 bits into the low
// quadword with X86vzmovl semantics (upper lane zeroed in the DAG).
let isCodeGenOnly = 1, AddedComplexity = 20 in {
def VMOVZQI2PQIrm : I<0x7E, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
                     "vmovq\t{$src, $dst|$dst, $src}",
                     [(set VR128:$dst,
                       (v2i64 (X86vzmovl (v2i64 (scalar_to_vector
                                                 (loadi64 addr:$src))))))],
                                                 IIC_SSE_MOVDQ>,
                     XS, VEX, Requires<[UseAVX]>, Sched<[WriteLoad]>;

def MOVZQI2PQIrm : I<0x7E, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
                     "movq\t{$src, $dst|$dst, $src}",
                     [(set VR128:$dst,
                       (v2i64 (X86vzmovl (v2i64 (scalar_to_vector
                                                 (loadi64 addr:$src))))))],
                                                 IIC_SSE_MOVDQ>,
                     XS, Requires<[UseSSE2]>, Sched<[WriteLoad]>;
}
4670
// Patterns folding vzmovl-of-load and X86vzload into the zero-extending
// MOVQ load forms above; 256-bit vzloads use a 128-bit MOVAPS/MOVUPS load
// wrapped in SUBREG_TO_REG.
let Predicates = [UseAVX], AddedComplexity = 20 in {
  def : Pat<(v2i64 (X86vzmovl (bc_v2i64 (loadv4f32 addr:$src)))),
            (VMOVZQI2PQIrm addr:$src)>;
  def : Pat<(v2i64 (X86vzload addr:$src)),
            (VMOVZQI2PQIrm addr:$src)>;
}

let Predicates = [UseSSE2], AddedComplexity = 20 in {
  def : Pat<(v2i64 (X86vzmovl (bc_v2i64 (loadv4f32 addr:$src)))),
            (MOVZQI2PQIrm addr:$src)>;
  def : Pat<(v2i64 (X86vzload addr:$src)), (MOVZQI2PQIrm addr:$src)>;
}

let Predicates = [HasAVX] in {
def : Pat<(v4i64 (alignedX86vzload addr:$src)),
          (SUBREG_TO_REG (i32 0), (VMOVAPSrm addr:$src), sub_xmm)>;
def : Pat<(v4i64 (X86vzload addr:$src)),
          (SUBREG_TO_REG (i32 0), (VMOVUPSrm addr:$src), sub_xmm)>;
}
4690
4691//===---------------------------------------------------------------------===//
4692// Moving from XMM to XMM and clear upper 64 bits. Note, there is a bug in
4693// IA32 document. movq xmm1, xmm2 does clear the high bits.
4694//
// MOVQ xmm, xmm: copy the low quadword and zero the upper 64 bits
// (X86vzmovl register form).
let SchedRW = [WriteVecLogic] in {
let AddedComplexity = 15 in
def VMOVZPQILo2PQIrr : I<0x7E, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                        "vmovq\t{$src, $dst|$dst, $src}",
                    [(set VR128:$dst, (v2i64 (X86vzmovl (v2i64 VR128:$src))))],
                    IIC_SSE_MOVQ_RR>,
                      XS, VEX, Requires<[UseAVX]>;
let AddedComplexity = 15 in
def MOVZPQILo2PQIrr : I<0x7E, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                        "movq\t{$src, $dst|$dst, $src}",
                    [(set VR128:$dst, (v2i64 (X86vzmovl (v2i64 VR128:$src))))],
                    IIC_SSE_MOVQ_RR>,
                      XS, Requires<[UseSSE2]>;
} // SchedRW
4709
// Load form of the above: X86vzmovl of a full 128-bit load (only the low
// quadword of the result is kept; upper bits zeroed).
let isCodeGenOnly = 1, SchedRW = [WriteVecLogicLd] in {
let AddedComplexity = 20 in
def VMOVZPQILo2PQIrm : I<0x7E, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
                        "vmovq\t{$src, $dst|$dst, $src}",
                    [(set VR128:$dst, (v2i64 (X86vzmovl
                                             (loadv2i64 addr:$src))))],
                                             IIC_SSE_MOVDQ>,
                      XS, VEX, Requires<[UseAVX]>;
let AddedComplexity = 20 in {
def MOVZPQILo2PQIrm : I<0x7E, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
                        "movq\t{$src, $dst|$dst, $src}",
                    [(set VR128:$dst, (v2i64 (X86vzmovl
                                             (loadv2i64 addr:$src))))],
                                             IIC_SSE_MOVDQ>,
                      XS, Requires<[UseSSE2]>;
}
} // isCodeGenOnly, SchedRW
4727
// f64 variant of the zeroing move: reuse the integer MOVZPQILo2PQIrr for
// v2f64 X86vzmovl.
let AddedComplexity = 20 in {
  let Predicates = [UseAVX] in {
    def : Pat<(v2f64 (X86vzmovl (v2f64 VR128:$src))),
              (VMOVZPQILo2PQIrr VR128:$src)>;
  }
  let Predicates = [UseSSE2] in {
    def : Pat<(v2f64 (X86vzmovl (v2f64 VR128:$src))),
              (MOVZPQILo2PQIrr VR128:$src)>;
  }
}
4738
4739//===---------------------------------------------------------------------===//
4740// SSE3 - Replicate Single FP - MOVSHDUP and MOVSLDUP
4741//===---------------------------------------------------------------------===//
// Shared rr/rm skeleton for MOVSHDUP/MOVSLDUP (replicate odd/even single-FP
// elements); parameterized over value type, register class and memory
// operand so it serves both the 128- and 256-bit variants.
multiclass sse3_replicate_sfp<bits<8> op, SDNode OpNode, string OpcodeStr,
                              ValueType vt, RegisterClass RC, PatFrag mem_frag,
                              X86MemOperand x86memop> {
def rr : S3SI<op, MRMSrcReg, (outs RC:$dst), (ins RC:$src),
                    !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                      [(set RC:$dst, (vt (OpNode RC:$src)))],
                      IIC_SSE_MOV_LH>, Sched<[WriteShuffle]>;
def rm : S3SI<op, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src),
                    !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                      [(set RC:$dst, (OpNode (mem_frag addr:$src)))],
                      IIC_SSE_MOV_LH>, Sched<[WriteShuffleLd]>;
}
4754
// Instantiations: AVX 128/256-bit forms (unaligned loadv* fragments) and
// legacy SSE3 forms (alignment-checked memopv4f32).
let Predicates = [HasAVX] in {
  defm VMOVSHDUP  : sse3_replicate_sfp<0x16, X86Movshdup, "vmovshdup",
                                       v4f32, VR128, loadv4f32, f128mem>, VEX;
  defm VMOVSLDUP  : sse3_replicate_sfp<0x12, X86Movsldup, "vmovsldup",
                                       v4f32, VR128, loadv4f32, f128mem>, VEX;
  defm VMOVSHDUPY : sse3_replicate_sfp<0x16, X86Movshdup, "vmovshdup",
                                 v8f32, VR256, loadv8f32, f256mem>, VEX, VEX_L;
  defm VMOVSLDUPY : sse3_replicate_sfp<0x12, X86Movsldup, "vmovsldup",
                                 v8f32, VR256, loadv8f32, f256mem>, VEX, VEX_L;
}
defm MOVSHDUP : sse3_replicate_sfp<0x16, X86Movshdup, "movshdup", v4f32, VR128,
                                   memopv4f32, f128mem>;
defm MOVSLDUP : sse3_replicate_sfp<0x12, X86Movsldup, "movsldup", v4f32, VR128,
                                   memopv4f32, f128mem>;
4769
// Integer-typed patterns: match movshdup/movsldup shuffles of v4i32/v8i32
// onto the FP instructions above (the multiclass only covers the FP types).
let Predicates = [HasAVX] in {
  def : Pat<(v4i32 (X86Movshdup VR128:$src)),
            (VMOVSHDUPrr VR128:$src)>;
  def : Pat<(v4i32 (X86Movshdup (bc_v4i32 (loadv2i64 addr:$src)))),
            (VMOVSHDUPrm addr:$src)>;
  def : Pat<(v4i32 (X86Movsldup VR128:$src)),
            (VMOVSLDUPrr VR128:$src)>;
  def : Pat<(v4i32 (X86Movsldup (bc_v4i32 (loadv2i64 addr:$src)))),
            (VMOVSLDUPrm addr:$src)>;
  def : Pat<(v8i32 (X86Movshdup VR256:$src)),
            (VMOVSHDUPYrr VR256:$src)>;
  def : Pat<(v8i32 (X86Movshdup (bc_v8i32 (loadv4i64 addr:$src)))),
            (VMOVSHDUPYrm addr:$src)>;
  def : Pat<(v8i32 (X86Movsldup VR256:$src)),
            (VMOVSLDUPYrr VR256:$src)>;
  def : Pat<(v8i32 (X86Movsldup (bc_v8i32 (loadv4i64 addr:$src)))),
            (VMOVSLDUPYrm addr:$src)>;
}

// SSE3 equivalents (memopv2i64 enforces alignment for the folded load).
let Predicates = [UseSSE3] in {
  def : Pat<(v4i32 (X86Movshdup VR128:$src)),
            (MOVSHDUPrr VR128:$src)>;
  def : Pat<(v4i32 (X86Movshdup (bc_v4i32 (memopv2i64 addr:$src)))),
            (MOVSHDUPrm addr:$src)>;
  def : Pat<(v4i32 (X86Movsldup VR128:$src)),
            (MOVSLDUPrr VR128:$src)>;
  def : Pat<(v4i32 (X86Movsldup (bc_v4i32 (memopv2i64 addr:$src)))),
            (MOVSLDUPrm addr:$src)>;
}
4799
4800//===---------------------------------------------------------------------===//
4801// SSE3 - Replicate Double FP - MOVDDUP
4802//===---------------------------------------------------------------------===//
4803
// 128-bit MOVDDUP skeleton (duplicate the low double). The rr form carries
// no pattern (register shuffles are matched via Pats below), hence
// neverHasSideEffects; the rm form folds a 64-bit load and duplicates it.
multiclass sse3_replicate_dfp<string OpcodeStr> {
let neverHasSideEffects = 1 in
def rr  : S3DI<0x12, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                    !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                    [], IIC_SSE_MOV_LH>, Sched<[WriteShuffle]>;
def rm  : S3DI<0x12, MRMSrcMem, (outs VR128:$dst), (ins f64mem:$src),
                    !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                    [(set VR128:$dst,
                      (v2f64 (X86Movddup
                              (scalar_to_vector (loadf64 addr:$src)))))],
                              IIC_SSE_MOV_LH>, Sched<[WriteShuffleLd]>;
}
4816
4817// FIXME: Merge with the above class when there are patterns for the ymm version
// 256-bit MOVDDUP skeleton: unlike the 128-bit version, the rr form has a
// direct v4f64 X86Movddup pattern.
multiclass sse3_replicate_dfp_y<string OpcodeStr> {
def rr  : S3DI<0x12, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
                    !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                    [(set VR256:$dst, (v4f64 (X86Movddup VR256:$src)))]>,
                    Sched<[WriteShuffle]>;
def rm  : S3DI<0x12, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src),
                    !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                    [(set VR256:$dst,
                      (v4f64 (X86Movddup
                              (scalar_to_vector (loadf64 addr:$src)))))]>,
                    Sched<[WriteShuffleLd]>;
}
4830
// Instantiate the MOVDDUP instructions: VEX-encoded 128/256-bit forms for
// AVX, and the legacy-encoded 128-bit form.
let Predicates = [HasAVX] in {
  defm VMOVDDUP  : sse3_replicate_dfp<"vmovddup">, VEX;
  defm VMOVDDUPY : sse3_replicate_dfp_y<"vmovddup">, VEX, VEX_L;
}

defm MOVDDUP : sse3_replicate_dfp<"movddup">;
4837
// Extra MOVDDUP selection patterns for AVX: fold full-width loads (in any
// bitcasted form) and broadcast-style scalar i64 loads into the memory form.
// The per-pattern Requires<[HasAVX]> annotations were redundant -- the
// enclosing "let Predicates = [HasAVX]" already restricts every pattern in
// this block -- so they have been dropped.
let Predicates = [HasAVX] in {
  def : Pat<(X86Movddup (loadv2f64 addr:$src)),
            (VMOVDDUPrm addr:$src)>;
  def : Pat<(X86Movddup (bc_v2f64 (loadv4f32 addr:$src))),
            (VMOVDDUPrm addr:$src)>;
  def : Pat<(X86Movddup (bc_v2f64 (loadv2i64 addr:$src))),
            (VMOVDDUPrm addr:$src)>;
  def : Pat<(X86Movddup (bc_v2f64
                             (v2i64 (scalar_to_vector (loadi64 addr:$src))))),
            (VMOVDDUPrm addr:$src)>;

  // 256-bit version
  def : Pat<(X86Movddup (loadv4f64 addr:$src)),
            (VMOVDDUPYrm addr:$src)>;
  def : Pat<(X86Movddup (loadv4i64 addr:$src)),
            (VMOVDDUPYrm addr:$src)>;
  def : Pat<(X86Movddup (v4i64 (scalar_to_vector (loadi64 addr:$src)))),
            (VMOVDDUPYrm addr:$src)>;
  def : Pat<(X86Movddup (v4i64 VR256:$src)),
            (VMOVDDUPYrr VR256:$src)>;
}
4859
// Same folding patterns for the legacy-encoded MOVDDUP (memop requires the
// stricter SSE alignment, hence memop* instead of load*).
let Predicates = [UseSSE3] in {
  def : Pat<(X86Movddup (memopv2f64 addr:$src)),
            (MOVDDUPrm addr:$src)>;
  def : Pat<(X86Movddup (bc_v2f64 (memopv4f32 addr:$src))),
            (MOVDDUPrm addr:$src)>;
  def : Pat<(X86Movddup (bc_v2f64 (memopv2i64 addr:$src))),
            (MOVDDUPrm addr:$src)>;
  def : Pat<(X86Movddup (bc_v2f64
                             (v2i64 (scalar_to_vector (loadi64 addr:$src))))),
            (MOVDDUPrm addr:$src)>;
}
4871
4872//===---------------------------------------------------------------------===//
4873// SSE3 - Move Unaligned Integer
4874//===---------------------------------------------------------------------===//
4875
// LDDQU/VLDDQU - unaligned 128/256-bit integer load, optimized for data
// spanning a cache-line split. Lowered from target intrinsics only.
// The VEX forms now carry IIC_SSE_LDDQU, matching the legacy form (they
// previously defaulted to no itinerary).
let SchedRW = [WriteLoad] in {
let Predicates = [HasAVX] in {
  def VLDDQUrm : S3DI<0xF0, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
                   "vlddqu\t{$src, $dst|$dst, $src}",
                   [(set VR128:$dst, (int_x86_sse3_ldu_dq addr:$src))],
                   IIC_SSE_LDDQU>, VEX;
  def VLDDQUYrm : S3DI<0xF0, MRMSrcMem, (outs VR256:$dst), (ins i256mem:$src),
                   "vlddqu\t{$src, $dst|$dst, $src}",
                   [(set VR256:$dst, (int_x86_avx_ldu_dq_256 addr:$src))],
                   IIC_SSE_LDDQU>, VEX, VEX_L;
}
def LDDQUrm : S3DI<0xF0, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
                   "lddqu\t{$src, $dst|$dst, $src}",
                   [(set VR128:$dst, (int_x86_sse3_ldu_dq addr:$src))],
                   IIC_SSE_LDDQU>;
}
4891
4892//===---------------------------------------------------------------------===//
4893// SSE3 - Arithmetic
4894//===---------------------------------------------------------------------===//
4895
// sse3_addsub - ADDSUB{PS,PD}: subtract even-indexed elements, add
// odd-indexed elements. Lowered from the target intrinsic 'Int'.
//   Is2Addr - two-address legacy form when 1, three-address VEX form when 0.
multiclass sse3_addsub<Intrinsic Int, string OpcodeStr, RegisterClass RC,
                       X86MemOperand x86memop, OpndItins itins,
                       bit Is2Addr = 1> {
  def rr : I<0xD0, MRMSrcReg,
       (outs RC:$dst), (ins RC:$src1, RC:$src2),
       !if(Is2Addr,
           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
       [(set RC:$dst, (Int RC:$src1, RC:$src2))], itins.rr>,
       Sched<[itins.Sched]>;
  def rm : I<0xD0, MRMSrcMem,
       (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
       !if(Is2Addr,
           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
       // Bug fix: the folded-load form must use the memory itinerary
       // (itins.rm); it previously copy-pasted itins.rr.
       [(set RC:$dst, (Int RC:$src1, (memop addr:$src2)))], itins.rm>,
       Sched<[itins.Sched.Folded, ReadAfterLd]>;
}
4914
// ADDSUBPS/ADDSUBPD instantiations: three-address VEX forms (128/256-bit)
// for AVX, then the two-address legacy-encoded forms for SSE3.
let Predicates = [HasAVX] in {
  let ExeDomain = SSEPackedSingle in {
    defm VADDSUBPS : sse3_addsub<int_x86_sse3_addsub_ps, "vaddsubps", VR128,
                                 f128mem, SSE_ALU_F32P, 0>, TB, XD, VEX_4V;
    defm VADDSUBPSY : sse3_addsub<int_x86_avx_addsub_ps_256, "vaddsubps", VR256,
                               f256mem, SSE_ALU_F32P, 0>, TB, XD, VEX_4V, VEX_L;
  }
  let ExeDomain = SSEPackedDouble in {
    defm VADDSUBPD : sse3_addsub<int_x86_sse3_addsub_pd, "vaddsubpd", VR128,
                                 f128mem, SSE_ALU_F64P, 0>, TB, OpSize, VEX_4V;
    defm VADDSUBPDY : sse3_addsub<int_x86_avx_addsub_pd_256, "vaddsubpd", VR256,
                           f256mem, SSE_ALU_F64P, 0>, TB, OpSize, VEX_4V, VEX_L;
  }
}
let Constraints = "$src1 = $dst", Predicates = [UseSSE3] in {
  let ExeDomain = SSEPackedSingle in
  defm ADDSUBPS : sse3_addsub<int_x86_sse3_addsub_ps, "addsubps", VR128,
                              f128mem, SSE_ALU_F32P>, TB, XD;
  let ExeDomain = SSEPackedDouble in
  defm ADDSUBPD : sse3_addsub<int_x86_sse3_addsub_pd, "addsubpd", VR128,
                              f128mem, SSE_ALU_F64P>, TB, OpSize;
}
4937
4938//===---------------------------------------------------------------------===//
4939// SSE3 Instructions
4940//===---------------------------------------------------------------------===//
4941
4942// Horizontal ops
// S3D_Int - horizontal FP op (HADD/HSUB) in the F3/XS-less single-precision
// encoding space (S3DI). Selected via the generic SDNode 'OpNode'.
multiclass S3D_Int<bits<8> o, string OpcodeStr, ValueType vt, RegisterClass RC,
                   X86MemOperand x86memop, SDNode OpNode, bit Is2Addr = 1> {
  def rr : S3DI<o, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
       !if(Is2Addr,
         !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
         !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
      [(set RC:$dst, (vt (OpNode RC:$src1, RC:$src2)))], IIC_SSE_HADDSUB_RR>,
      Sched<[WriteFAdd]>;

  def rm : S3DI<o, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
       !if(Is2Addr,
         !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
         !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
      [(set RC:$dst, (vt (OpNode RC:$src1, (memop addr:$src2))))],
        IIC_SSE_HADDSUB_RM>, Sched<[WriteFAddLd, ReadAfterLd]>;
}
// S3_Int - identical to S3D_Int except for the double-precision instruction
// format (S3I instead of S3DI); used for HADDPD/HSUBPD.
multiclass S3_Int<bits<8> o, string OpcodeStr, ValueType vt, RegisterClass RC,
                  X86MemOperand x86memop, SDNode OpNode, bit Is2Addr = 1> {
  def rr : S3I<o, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
       !if(Is2Addr,
         !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
         !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
      [(set RC:$dst, (vt (OpNode RC:$src1, RC:$src2)))], IIC_SSE_HADDSUB_RR>,
      Sched<[WriteFAdd]>;

  def rm : S3I<o, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
       !if(Is2Addr,
         !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
         !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
      [(set RC:$dst, (vt (OpNode RC:$src1, (memop addr:$src2))))],
        IIC_SSE_HADDSUB_RM>, Sched<[WriteFAddLd, ReadAfterLd]>;
}
4975
// Horizontal add/sub, VEX-encoded three-address forms (128- and 256-bit).
let Predicates = [HasAVX] in {
  let ExeDomain = SSEPackedSingle in {
    defm VHADDPS  : S3D_Int<0x7C, "vhaddps", v4f32, VR128, f128mem,
                            X86fhadd, 0>, VEX_4V;
    defm VHSUBPS  : S3D_Int<0x7D, "vhsubps", v4f32, VR128, f128mem,
                            X86fhsub, 0>, VEX_4V;
    defm VHADDPSY : S3D_Int<0x7C, "vhaddps", v8f32, VR256, f256mem,
                            X86fhadd, 0>, VEX_4V, VEX_L;
    defm VHSUBPSY : S3D_Int<0x7D, "vhsubps", v8f32, VR256, f256mem,
                            X86fhsub, 0>, VEX_4V, VEX_L;
  }
  let ExeDomain = SSEPackedDouble in {
    defm VHADDPD  : S3_Int <0x7C, "vhaddpd", v2f64, VR128, f128mem,
                            X86fhadd, 0>, VEX_4V;
    defm VHSUBPD  : S3_Int <0x7D, "vhsubpd", v2f64, VR128, f128mem,
                            X86fhsub, 0>, VEX_4V;
    defm VHADDPDY : S3_Int <0x7C, "vhaddpd", v4f64, VR256, f256mem,
                            X86fhadd, 0>, VEX_4V, VEX_L;
    defm VHSUBPDY : S3_Int <0x7D, "vhsubpd", v4f64, VR256, f256mem,
                            X86fhsub, 0>, VEX_4V, VEX_L;
  }
}
4998
// Horizontal add/sub, two-address legacy-encoded (SSE3) forms.
let Constraints = "$src1 = $dst" in {
  let ExeDomain = SSEPackedSingle in {
    defm HADDPS : S3D_Int<0x7C, "haddps", v4f32, VR128, f128mem, X86fhadd>;
    defm HSUBPS : S3D_Int<0x7D, "hsubps", v4f32, VR128, f128mem, X86fhsub>;
  }
  let ExeDomain = SSEPackedDouble in {
    defm HADDPD : S3_Int<0x7C, "haddpd", v2f64, VR128, f128mem, X86fhadd>;
    defm HSUBPD : S3_Int<0x7D, "hsubpd", v2f64, VR128, f128mem, X86fhsub>;
  }
}
5009
5010//===---------------------------------------------------------------------===//
5011// SSSE3 - Packed Absolute Instructions
5012//===---------------------------------------------------------------------===//
5013
5014
/// SS3I_unop_rm_int - Simple SSSE3 unary op whose type can be v*{i8,i16,i32}.
/// Lowered from the 128-bit intrinsic IntId128; the memory form folds a
/// bitcasted 128-bit integer load.
multiclass SS3I_unop_rm_int<bits<8> opc, string OpcodeStr,
                            Intrinsic IntId128> {
  def rr128 : SS38I<opc, MRMSrcReg, (outs VR128:$dst),
                    (ins VR128:$src),
                    !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                    [(set VR128:$dst, (IntId128 VR128:$src))], IIC_SSE_PABS_RR>,
                    OpSize, Sched<[WriteVecALU]>;

  def rm128 : SS38I<opc, MRMSrcMem, (outs VR128:$dst),
                    (ins i128mem:$src),
                    !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                    [(set VR128:$dst,
                      (IntId128
                       (bitconvert (memopv2i64 addr:$src))))], IIC_SSE_PABS_RM>,
                    OpSize, Sched<[WriteVecALULd]>;
}
5032
/// SS3I_unop_rm_int_y - Simple SSSE3 unary op whose type can be v*{i8,i16,i32}.
/// 256-bit (AVX2) counterpart of SS3I_unop_rm_int. Note these defs carry no
/// itinerary, unlike the 128-bit versions.
multiclass SS3I_unop_rm_int_y<bits<8> opc, string OpcodeStr,
                              Intrinsic IntId256> {
  def rr256 : SS38I<opc, MRMSrcReg, (outs VR256:$dst),
                    (ins VR256:$src),
                    !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                    [(set VR256:$dst, (IntId256 VR256:$src))]>,
                    OpSize, Sched<[WriteVecALU]>;

  def rm256 : SS38I<opc, MRMSrcMem, (outs VR256:$dst),
                    (ins i256mem:$src),
                    !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                    [(set VR256:$dst,
                      (IntId256
                       (bitconvert (memopv4i64 addr:$src))))]>, OpSize,
                    Sched<[WriteVecALULd]>;
}
5050
// Helper fragments to match sext vXi1 to vXiY, i.e. a vector whose elements
// are all-ones or all-zeros sign masks. For i8 elements this is a pcmpgt
// against zero (no byte arithmetic shift exists); wider elements use an
// arithmetic shift right by (bits-1).
def v16i1sextv16i8 : PatLeaf<(v16i8 (X86pcmpgt (bc_v16i8 (v4i32 immAllZerosV)),
                                               VR128:$src))>;
def v8i1sextv8i16  : PatLeaf<(v8i16 (X86vsrai VR128:$src, (i8 15)))>;
def v4i1sextv4i32  : PatLeaf<(v4i32 (X86vsrai VR128:$src, (i8 31)))>;
def v32i1sextv32i8 : PatLeaf<(v32i8 (X86pcmpgt (bc_v32i8 (v8i32 immAllZerosV)),
                                               VR256:$src))>;
def v16i1sextv16i16: PatLeaf<(v16i16 (X86vsrai VR256:$src, (i8 15)))>;
def v8i1sextv8i32  : PatLeaf<(v8i32 (X86vsrai VR256:$src, (i8 31)))>;
5060
// VEX-encoded 128-bit PABS, plus patterns recognizing the open-coded
// abs idiom: (x ^ sext-mask) == abs when sext-mask also feeds (x + mask),
// i.e. abs(x) = (x + (x>>31)) ^ (x>>31).
let Predicates = [HasAVX] in {
  defm VPABSB  : SS3I_unop_rm_int<0x1C, "vpabsb",
                                  int_x86_ssse3_pabs_b_128>, VEX;
  defm VPABSW  : SS3I_unop_rm_int<0x1D, "vpabsw",
                                  int_x86_ssse3_pabs_w_128>, VEX;
  defm VPABSD  : SS3I_unop_rm_int<0x1E, "vpabsd",
                                  int_x86_ssse3_pabs_d_128>, VEX;

  def : Pat<(xor
            (bc_v2i64 (v16i1sextv16i8)),
            (bc_v2i64 (add (v16i8 VR128:$src), (v16i1sextv16i8)))),
            (VPABSBrr128 VR128:$src)>;
  def : Pat<(xor
            (bc_v2i64 (v8i1sextv8i16)),
            (bc_v2i64 (add (v8i16 VR128:$src), (v8i1sextv8i16)))),
            (VPABSWrr128 VR128:$src)>;
  def : Pat<(xor
            (bc_v2i64 (v4i1sextv4i32)),
            (bc_v2i64 (add (v4i32 VR128:$src), (v4i1sextv4i32)))),
            (VPABSDrr128 VR128:$src)>;
}
5082
// AVX2 256-bit PABS and the corresponding open-coded-abs patterns
// (same idiom as the 128-bit block above, widened to 256 bits).
let Predicates = [HasAVX2] in {
  defm VPABSB  : SS3I_unop_rm_int_y<0x1C, "vpabsb",
                                    int_x86_avx2_pabs_b>, VEX, VEX_L;
  defm VPABSW  : SS3I_unop_rm_int_y<0x1D, "vpabsw",
                                    int_x86_avx2_pabs_w>, VEX, VEX_L;
  defm VPABSD  : SS3I_unop_rm_int_y<0x1E, "vpabsd",
                                    int_x86_avx2_pabs_d>, VEX, VEX_L;

  def : Pat<(xor
            (bc_v4i64 (v32i1sextv32i8)),
            (bc_v4i64 (add (v32i8 VR256:$src), (v32i1sextv32i8)))),
            (VPABSBrr256 VR256:$src)>;
  def : Pat<(xor
            (bc_v4i64 (v16i1sextv16i16)),
            (bc_v4i64 (add (v16i16 VR256:$src), (v16i1sextv16i16)))),
            (VPABSWrr256 VR256:$src)>;
  def : Pat<(xor
            (bc_v4i64 (v8i1sextv8i32)),
            (bc_v4i64 (add (v8i32 VR256:$src), (v8i1sextv8i32)))),
            (VPABSDrr256 VR256:$src)>;
}
5104
// Legacy-encoded SSSE3 PABS and the open-coded-abs patterns.
defm PABSB : SS3I_unop_rm_int<0x1C, "pabsb",
                              int_x86_ssse3_pabs_b_128>;
defm PABSW : SS3I_unop_rm_int<0x1D, "pabsw",
                              int_x86_ssse3_pabs_w_128>;
defm PABSD : SS3I_unop_rm_int<0x1E, "pabsd",
                              int_x86_ssse3_pabs_d_128>;

// NOTE(review): this block uses HasSSSE3 while sibling legacy-form pattern
// blocks in this file use UseSSSE3 -- confirm whether that is intentional.
let Predicates = [HasSSSE3] in {
  def : Pat<(xor
            (bc_v2i64 (v16i1sextv16i8)),
            (bc_v2i64 (add (v16i8 VR128:$src), (v16i1sextv16i8)))),
            (PABSBrr128 VR128:$src)>;
  def : Pat<(xor
            (bc_v2i64 (v8i1sextv8i16)),
            (bc_v2i64 (add (v8i16 VR128:$src), (v8i1sextv8i16)))),
            (PABSWrr128 VR128:$src)>;
  def : Pat<(xor
            (bc_v2i64 (v4i1sextv4i32)),
            (bc_v2i64 (add (v4i32 VR128:$src), (v4i1sextv4i32)))),
            (PABSDrr128 VR128:$src)>;
}
5126
5127//===---------------------------------------------------------------------===//
5128// SSSE3 - Packed Binary Operator Instructions
5129//===---------------------------------------------------------------------===//
5130
// Itinerary bundles for the SSSE3 packed binary ops below; the Sched let
// selects the X86FoldableSchedWrite used by the new scheduling models.
let Sched = WriteVecALU in {
def SSE_PHADDSUBD : OpndItins<
  IIC_SSE_PHADDSUBD_RR, IIC_SSE_PHADDSUBD_RM
>;
def SSE_PHADDSUBSW : OpndItins<
  IIC_SSE_PHADDSUBSW_RR, IIC_SSE_PHADDSUBSW_RM
>;
def SSE_PHADDSUBW : OpndItins<
  IIC_SSE_PHADDSUBW_RR, IIC_SSE_PHADDSUBW_RM
>;
}
let Sched = WriteShuffle in
def SSE_PSHUFB : OpndItins<
  IIC_SSE_PSHUFB_RR, IIC_SSE_PSHUFB_RM
>;
let Sched = WriteVecALU in
def SSE_PSIGN : OpndItins<
  IIC_SSE_PSIGN_RR, IIC_SSE_PSIGN_RM
>;
// PMULHRSW uses the same itinerary for both register and memory forms.
let Sched = WriteVecIMul in
def SSE_PMULHRSW : OpndItins<
  IIC_SSE_PMULHRSW, IIC_SSE_PMULHRSW
>;
5154
/// SS3I_binop_rm - Simple SSSE3 bin op selected via the generic SDNode
/// 'OpNode'; parameterized over 128/256-bit register class, memory operand
/// and load fragment so it serves both the SSE and AVX/AVX2 instantiations.
multiclass SS3I_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
                         ValueType OpVT, RegisterClass RC, PatFrag memop_frag,
                         X86MemOperand x86memop, OpndItins itins,
                         bit Is2Addr = 1> {
  let isCommutable = 1 in
  def rr : SS38I<opc, MRMSrcReg, (outs RC:$dst),
       (ins RC:$src1, RC:$src2),
       !if(Is2Addr,
         !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
         !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
       [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2)))], itins.rr>,
       OpSize, Sched<[itins.Sched]>;
  def rm : SS38I<opc, MRMSrcMem, (outs RC:$dst),
       (ins RC:$src1, x86memop:$src2),
       !if(Is2Addr,
         !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
         !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
       [(set RC:$dst,
         (OpVT (OpNode RC:$src1,
          (bitconvert (memop_frag addr:$src2)))))], itins.rm>, OpSize,
       Sched<[itins.Sched.Folded, ReadAfterLd]>;
}
5178
/// SS3I_binop_rm_int - Simple SSSE3 bin op whose type can be v*{i8,i16,i32}.
/// Lowered from the 128-bit intrinsic IntId128.
/// Fix: the instruction defs now attach itins.rr/itins.rm; the 'itins'
/// operand was previously only used for the Sched<> annotations.
multiclass SS3I_binop_rm_int<bits<8> opc, string OpcodeStr,
                             Intrinsic IntId128, OpndItins itins,
                             bit Is2Addr = 1> {
  let isCommutable = 1 in
  def rr128 : SS38I<opc, MRMSrcReg, (outs VR128:$dst),
       (ins VR128:$src1, VR128:$src2),
       !if(Is2Addr,
         !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
         !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
       [(set VR128:$dst, (IntId128 VR128:$src1, VR128:$src2))], itins.rr>,
       OpSize, Sched<[itins.Sched]>;
  def rm128 : SS38I<opc, MRMSrcMem, (outs VR128:$dst),
       (ins VR128:$src1, i128mem:$src2),
       !if(Is2Addr,
         !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
         !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
       [(set VR128:$dst,
         (IntId128 VR128:$src1,
          (bitconvert (memopv2i64 addr:$src2))))], itins.rm>, OpSize,
       Sched<[itins.Sched.Folded, ReadAfterLd]>;
}
5201
// SS3I_binop_rm_int_y - 256-bit (AVX2) intrinsic bin op; always the
// three-address VEX form, hence no Is2Addr parameter. No itinerary or
// Sched info is attached here.
multiclass SS3I_binop_rm_int_y<bits<8> opc, string OpcodeStr,
                               Intrinsic IntId256> {
  let isCommutable = 1 in
  def rr256 : SS38I<opc, MRMSrcReg, (outs VR256:$dst),
       (ins VR256:$src1, VR256:$src2),
       !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
       [(set VR256:$dst, (IntId256 VR256:$src1, VR256:$src2))]>,
       OpSize;
  def rm256 : SS38I<opc, MRMSrcMem, (outs VR256:$dst),
       (ins VR256:$src1, i256mem:$src2),
       !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
       [(set VR256:$dst,
         (IntId256 VR256:$src1,
          (bitconvert (loadv4i64 addr:$src2))))]>, OpSize;
}
5217
// SSSE3 packed binary ops, VEX-encoded 128-bit (AVX) forms. The horizontal
// ops, sign ops and pshufb are not commutative; pmulhrsw is.
let ImmT = NoImm, Predicates = [HasAVX] in {
let isCommutable = 0 in {
  defm VPHADDW    : SS3I_binop_rm<0x01, "vphaddw", X86hadd, v8i16, VR128,
                                  loadv2i64, i128mem,
                                  SSE_PHADDSUBW, 0>, VEX_4V;
  defm VPHADDD    : SS3I_binop_rm<0x02, "vphaddd", X86hadd, v4i32, VR128,
                                  loadv2i64, i128mem,
                                  SSE_PHADDSUBD, 0>, VEX_4V;
  defm VPHSUBW    : SS3I_binop_rm<0x05, "vphsubw", X86hsub, v8i16, VR128,
                                  loadv2i64, i128mem,
                                  SSE_PHADDSUBW, 0>, VEX_4V;
  defm VPHSUBD    : SS3I_binop_rm<0x06, "vphsubd", X86hsub, v4i32, VR128,
                                  loadv2i64, i128mem,
                                  SSE_PHADDSUBD, 0>, VEX_4V;
  defm VPSIGNB    : SS3I_binop_rm<0x08, "vpsignb", X86psign, v16i8, VR128,
                                  loadv2i64, i128mem,
                                  SSE_PSIGN, 0>, VEX_4V;
  defm VPSIGNW    : SS3I_binop_rm<0x09, "vpsignw", X86psign, v8i16, VR128,
                                  loadv2i64, i128mem,
                                  SSE_PSIGN, 0>, VEX_4V;
  defm VPSIGND    : SS3I_binop_rm<0x0A, "vpsignd", X86psign, v4i32, VR128,
                                  loadv2i64, i128mem,
                                  SSE_PSIGN, 0>, VEX_4V;
  defm VPSHUFB    : SS3I_binop_rm<0x00, "vpshufb", X86pshufb, v16i8, VR128,
                                  loadv2i64, i128mem,
                                  SSE_PSHUFB, 0>, VEX_4V;
  defm VPHADDSW   : SS3I_binop_rm_int<0x03, "vphaddsw",
                                      int_x86_ssse3_phadd_sw_128,
                                      SSE_PHADDSUBSW, 0>, VEX_4V;
  defm VPHSUBSW   : SS3I_binop_rm_int<0x07, "vphsubsw",
                                      int_x86_ssse3_phsub_sw_128,
                                      SSE_PHADDSUBSW, 0>, VEX_4V;
  defm VPMADDUBSW : SS3I_binop_rm_int<0x04, "vpmaddubsw",
                                      int_x86_ssse3_pmadd_ub_sw_128,
                                      SSE_PMADD, 0>, VEX_4V;
}
defm VPMULHRSW    : SS3I_binop_rm_int<0x0B, "vpmulhrsw",
                                      int_x86_ssse3_pmul_hr_sw_128,
                                      SSE_PMULHRSW, 0>, VEX_4V;
}
5258
// SSSE3 packed binary ops, VEX-encoded 256-bit (AVX2) forms.
// Fix: the itineraries now match the 128-bit AVX block above -- the
// originals copy-pasted SSE_PHADDSUBW onto every defm, including the
// dword horizontal ops (should be SSE_PHADDSUBD), the sign ops (SSE_PSIGN)
// and pshufb (SSE_PSHUFB).
let ImmT = NoImm, Predicates = [HasAVX2] in {
let isCommutable = 0 in {
  defm VPHADDWY   : SS3I_binop_rm<0x01, "vphaddw", X86hadd, v16i16, VR256,
                                  loadv4i64, i256mem,
                                  SSE_PHADDSUBW, 0>, VEX_4V, VEX_L;
  defm VPHADDDY   : SS3I_binop_rm<0x02, "vphaddd", X86hadd, v8i32, VR256,
                                  loadv4i64, i256mem,
                                  SSE_PHADDSUBD, 0>, VEX_4V, VEX_L;
  defm VPHSUBWY   : SS3I_binop_rm<0x05, "vphsubw", X86hsub, v16i16, VR256,
                                  loadv4i64, i256mem,
                                  SSE_PHADDSUBW, 0>, VEX_4V, VEX_L;
  defm VPHSUBDY   : SS3I_binop_rm<0x06, "vphsubd", X86hsub, v8i32, VR256,
                                  loadv4i64, i256mem,
                                  SSE_PHADDSUBD, 0>, VEX_4V, VEX_L;
  defm VPSIGNBY   : SS3I_binop_rm<0x08, "vpsignb", X86psign, v32i8, VR256,
                                  loadv4i64, i256mem,
                                  SSE_PSIGN, 0>, VEX_4V, VEX_L;
  defm VPSIGNWY   : SS3I_binop_rm<0x09, "vpsignw", X86psign, v16i16, VR256,
                                  loadv4i64, i256mem,
                                  SSE_PSIGN, 0>, VEX_4V, VEX_L;
  defm VPSIGNDY   : SS3I_binop_rm<0x0A, "vpsignd", X86psign, v8i32, VR256,
                                  loadv4i64, i256mem,
                                  SSE_PSIGN, 0>, VEX_4V, VEX_L;
  defm VPSHUFBY   : SS3I_binop_rm<0x00, "vpshufb", X86pshufb, v32i8, VR256,
                                  loadv4i64, i256mem,
                                  SSE_PSHUFB, 0>, VEX_4V, VEX_L;
  defm VPHADDSW   : SS3I_binop_rm_int_y<0x03, "vphaddsw",
                                        int_x86_avx2_phadd_sw>, VEX_4V, VEX_L;
  defm VPHSUBSW   : SS3I_binop_rm_int_y<0x07, "vphsubsw",
                                        int_x86_avx2_phsub_sw>, VEX_4V, VEX_L;
  defm VPMADDUBSW : SS3I_binop_rm_int_y<0x04, "vpmaddubsw",
                                       int_x86_avx2_pmadd_ub_sw>, VEX_4V, VEX_L;
}
defm VPMULHRSW    : SS3I_binop_rm_int_y<0x0B, "vpmulhrsw",
                                        int_x86_avx2_pmul_hr_sw>, VEX_4V, VEX_L;
}
5295
// SSSE3 packed binary ops, legacy-encoded two-address forms.
// None of these have i8 immediate fields.
let ImmT = NoImm, Constraints = "$src1 = $dst" in {
let isCommutable = 0 in {
  defm PHADDW    : SS3I_binop_rm<0x01, "phaddw", X86hadd, v8i16, VR128,
                                 memopv2i64, i128mem, SSE_PHADDSUBW>;
  defm PHADDD    : SS3I_binop_rm<0x02, "phaddd", X86hadd, v4i32, VR128,
                                 memopv2i64, i128mem, SSE_PHADDSUBD>;
  defm PHSUBW    : SS3I_binop_rm<0x05, "phsubw", X86hsub, v8i16, VR128,
                                 memopv2i64, i128mem, SSE_PHADDSUBW>;
  defm PHSUBD    : SS3I_binop_rm<0x06, "phsubd", X86hsub, v4i32, VR128,
                                 memopv2i64, i128mem, SSE_PHADDSUBD>;
  defm PSIGNB    : SS3I_binop_rm<0x08, "psignb", X86psign, v16i8, VR128,
                                 memopv2i64, i128mem, SSE_PSIGN>;
  defm PSIGNW    : SS3I_binop_rm<0x09, "psignw", X86psign, v8i16, VR128,
                                 memopv2i64, i128mem, SSE_PSIGN>;
  defm PSIGND    : SS3I_binop_rm<0x0A, "psignd", X86psign, v4i32, VR128,
                                 memopv2i64, i128mem, SSE_PSIGN>;
  defm PSHUFB    : SS3I_binop_rm<0x00, "pshufb", X86pshufb, v16i8, VR128,
                                 memopv2i64, i128mem, SSE_PSHUFB>;
  defm PHADDSW   : SS3I_binop_rm_int<0x03, "phaddsw",
                                     int_x86_ssse3_phadd_sw_128,
                                     SSE_PHADDSUBSW>;
  defm PHSUBSW   : SS3I_binop_rm_int<0x07, "phsubsw",
                                     int_x86_ssse3_phsub_sw_128,
                                     SSE_PHADDSUBSW>;
  defm PMADDUBSW : SS3I_binop_rm_int<0x04, "pmaddubsw",
                                     int_x86_ssse3_pmadd_ub_sw_128, SSE_PMADD>;
}
defm PMULHRSW    : SS3I_binop_rm_int<0x0B, "pmulhrsw",
                                     int_x86_ssse3_pmul_hr_sw_128,
                                     SSE_PMULHRSW>;
}
5328
5329//===---------------------------------------------------------------------===//
5330// SSSE3 - Packed Align Instruction Patterns
5331//===---------------------------------------------------------------------===//
5332
// ssse3_palignr - 128-bit PALIGNR (concatenate-and-shift-right by $src3
// bytes). No patterns here; selection happens via the Pat<>s further down.
multiclass ssse3_palignr<string asm, bit Is2Addr = 1> {
  let neverHasSideEffects = 1 in {
  def R128rr : SS3AI<0x0F, MRMSrcReg, (outs VR128:$dst),
      (ins VR128:$src1, VR128:$src2, i8imm:$src3),
      !if(Is2Addr,
        !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
        !strconcat(asm,
                  "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
      [], IIC_SSE_PALIGNRR>, OpSize, Sched<[WriteShuffle]>;
  let mayLoad = 1 in
  def R128rm : SS3AI<0x0F, MRMSrcMem, (outs VR128:$dst),
      (ins VR128:$src1, i128mem:$src2, i8imm:$src3),
      !if(Is2Addr,
        !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
        !strconcat(asm,
                  "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
      [], IIC_SSE_PALIGNRM>, OpSize, Sched<[WriteShuffleLd, ReadAfterLd]>;
  }
}
5352
// ssse3_palignr_y - 256-bit (AVX2) VPALIGNR; per-128-bit-lane behavior.
// Always the three-address VEX form. No patterns or itineraries attached.
multiclass ssse3_palignr_y<string asm, bit Is2Addr = 1> {
  let neverHasSideEffects = 1 in {
  def R256rr : SS3AI<0x0F, MRMSrcReg, (outs VR256:$dst),
      (ins VR256:$src1, VR256:$src2, i8imm:$src3),
      !strconcat(asm,
                 "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
      []>, OpSize, Sched<[WriteShuffle]>;
  let mayLoad = 1 in
  def R256rm : SS3AI<0x0F, MRMSrcMem, (outs VR256:$dst),
      (ins VR256:$src1, i256mem:$src2, i8imm:$src3),
      !strconcat(asm,
                 "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
      []>, OpSize, Sched<[WriteShuffleLd, ReadAfterLd]>;
  }
}
5368
// PALIGNR instantiations: VEX 128-bit (AVX), VEX 256-bit (AVX2), legacy.
let Predicates = [HasAVX] in
  defm VPALIGN : ssse3_palignr<"vpalignr", 0>, VEX_4V;
let Predicates = [HasAVX2] in
  defm VPALIGN : ssse3_palignr_y<"vpalignr", 0>, VEX_4V, VEX_L;
let Constraints = "$src1 = $dst", Predicates = [UseSSSE3] in
  defm PALIGN : ssse3_palignr<"palignr">;
5375
// Selection patterns for X86PAlignr. Note the operand swap: the SDNode's
// ($src1, $src2) map to the instruction's ($src2, $src1).
let Predicates = [HasAVX2] in {
def : Pat<(v8i32 (X86PAlignr VR256:$src1, VR256:$src2, (i8 imm:$imm))),
          (VPALIGNR256rr VR256:$src2, VR256:$src1, imm:$imm)>;
def : Pat<(v8f32 (X86PAlignr VR256:$src1, VR256:$src2, (i8 imm:$imm))),
          (VPALIGNR256rr VR256:$src2, VR256:$src1, imm:$imm)>;
def : Pat<(v16i16 (X86PAlignr VR256:$src1, VR256:$src2, (i8 imm:$imm))),
          (VPALIGNR256rr VR256:$src2, VR256:$src1, imm:$imm)>;
def : Pat<(v32i8 (X86PAlignr VR256:$src1, VR256:$src2, (i8 imm:$imm))),
          (VPALIGNR256rr VR256:$src2, VR256:$src1, imm:$imm)>;
}

let Predicates = [HasAVX] in {
def : Pat<(v4i32 (X86PAlignr VR128:$src1, VR128:$src2, (i8 imm:$imm))),
          (VPALIGNR128rr VR128:$src2, VR128:$src1, imm:$imm)>;
def : Pat<(v4f32 (X86PAlignr VR128:$src1, VR128:$src2, (i8 imm:$imm))),
          (VPALIGNR128rr VR128:$src2, VR128:$src1, imm:$imm)>;
def : Pat<(v8i16 (X86PAlignr VR128:$src1, VR128:$src2, (i8 imm:$imm))),
          (VPALIGNR128rr VR128:$src2, VR128:$src1, imm:$imm)>;
def : Pat<(v16i8 (X86PAlignr VR128:$src1, VR128:$src2, (i8 imm:$imm))),
          (VPALIGNR128rr VR128:$src2, VR128:$src1, imm:$imm)>;
}

let Predicates = [UseSSSE3] in {
def : Pat<(v4i32 (X86PAlignr VR128:$src1, VR128:$src2, (i8 imm:$imm))),
          (PALIGNR128rr VR128:$src2, VR128:$src1, imm:$imm)>;
def : Pat<(v4f32 (X86PAlignr VR128:$src1, VR128:$src2, (i8 imm:$imm))),
          (PALIGNR128rr VR128:$src2, VR128:$src1, imm:$imm)>;
def : Pat<(v8i16 (X86PAlignr VR128:$src1, VR128:$src2, (i8 imm:$imm))),
          (PALIGNR128rr VR128:$src2, VR128:$src1, imm:$imm)>;
def : Pat<(v16i8 (X86PAlignr VR128:$src1, VR128:$src2, (i8 imm:$imm))),
          (PALIGNR128rr VR128:$src2, VR128:$src1, imm:$imm)>;
}
5408
5409//===---------------------------------------------------------------------===//
// SSE3 - Thread synchronization (MONITOR/MWAIT)
5411//===---------------------------------------------------------------------===//
5412
// MONITOR/MWAIT (introduced with SSE3). MONITOR goes through a pseudo with
// a custom inserter so the intrinsic's operands can be moved into the fixed
// EAX/ECX/EDX registers; MWAIT matches the intrinsic directly.
let SchedRW = [WriteSystem] in {
let usesCustomInserter = 1 in {
def MONITOR : PseudoI<(outs), (ins i32mem:$src1, GR32:$src2, GR32:$src3),
                [(int_x86_sse3_monitor addr:$src1, GR32:$src2, GR32:$src3)]>,
                Requires<[HasSSE3]>;
}

// Real encoding; operands are implicit in EAX/ECX/EDX.
let Uses = [EAX, ECX, EDX] in
def MONITORrrr : I<0x01, MRM_C8, (outs), (ins), "monitor", [], IIC_SSE_MONITOR>,
                 TB, Requires<[HasSSE3]>;
let Uses = [ECX, EAX] in
def MWAITrr   : I<0x01, MRM_C9, (outs), (ins), "mwait",
                [(int_x86_sse3_mwait ECX, EAX)], IIC_SSE_MWAIT>,
                TB, Requires<[HasSSE3]>;
} // SchedRW
5428
// Assembler aliases accepting the (redundant) explicit register operands
// for mwait/monitor; 32- and 64-bit modes spell the registers differently.
def : InstAlias<"mwait\t{%eax, %ecx|ecx, eax}", (MWAITrr)>, Requires<[In32BitMode]>;
def : InstAlias<"mwait\t{%rax, %rcx|rcx, rax}", (MWAITrr)>, Requires<[In64BitMode]>;

def : InstAlias<"monitor\t{%eax, %ecx, %edx|edx, ecx, eax}", (MONITORrrr)>,
      Requires<[In32BitMode]>;
def : InstAlias<"monitor\t{%rax, %rcx, %rdx|rdx, rcx, rax}", (MONITORrrr)>,
      Requires<[In64BitMode]>;
5436
5437//===----------------------------------------------------------------------===//
5438// SSE4.1 - Packed Move with Sign/Zero Extend
5439//===----------------------------------------------------------------------===//
5440
// SS41I_binop_rm_int8 - SSE4.1 PMOVSX/PMOVZX with 64-bit memory source:
// extends 8 bytes (loaded via scalar_to_vector of an i64) to a full XMM.
multiclass SS41I_binop_rm_int8<bits<8> opc, string OpcodeStr, Intrinsic IntId,
                               OpndItins itins = DEFAULT_ITINS> {
  def rr : SS48I<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                 [(set VR128:$dst, (IntId VR128:$src))], itins.rr>, OpSize;

  def rm : SS48I<opc, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
                 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
       [(set VR128:$dst,
         (IntId (bitconvert (v2i64 (scalar_to_vector (loadi64 addr:$src))))))],
         itins.rm>, OpSize;
}
5453
// SS41I_binop_rm_int16_y - AVX2 256-bit pmovsx/pmovzx forms whose memory
// variant reads a full 16 bytes (bw/wd/dq widenings of an xmm source to
// a ymm destination).
5454multiclass SS41I_binop_rm_int16_y<bits<8> opc, string OpcodeStr,
5455                                 Intrinsic IntId> {
5456  def Yrr : SS48I<opc, MRMSrcReg, (outs VR256:$dst), (ins VR128:$src),
5457                  !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
5458                  [(set VR256:$dst, (IntId VR128:$src))]>, OpSize;
5459
5460  def Yrm : SS48I<opc, MRMSrcMem, (outs VR256:$dst), (ins i128mem:$src),
5461                  !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
5462                  [(set VR256:$dst, (IntId (load addr:$src)))]>,
5463                  OpSize;
5464}
5465
// VEX-encoded forms: AVX gives the 128-bit variants, AVX2 the 256-bit
// (VEX_L) variants of the same opcodes.
5466let Predicates = [HasAVX] in {
5467defm VPMOVSXBW : SS41I_binop_rm_int8<0x20, "vpmovsxbw",
5468                                     int_x86_sse41_pmovsxbw>, VEX;
5469defm VPMOVSXWD : SS41I_binop_rm_int8<0x23, "vpmovsxwd",
5470                                     int_x86_sse41_pmovsxwd>, VEX;
5471defm VPMOVSXDQ : SS41I_binop_rm_int8<0x25, "vpmovsxdq",
5472                                     int_x86_sse41_pmovsxdq>, VEX;
5473defm VPMOVZXBW : SS41I_binop_rm_int8<0x30, "vpmovzxbw",
5474                                     int_x86_sse41_pmovzxbw>, VEX;
5475defm VPMOVZXWD : SS41I_binop_rm_int8<0x33, "vpmovzxwd",
5476                                     int_x86_sse41_pmovzxwd>, VEX;
5477defm VPMOVZXDQ : SS41I_binop_rm_int8<0x35, "vpmovzxdq",
5478                                     int_x86_sse41_pmovzxdq>, VEX;
5479}
5480
5481let Predicates = [HasAVX2] in {
5482defm VPMOVSXBW : SS41I_binop_rm_int16_y<0x20, "vpmovsxbw",
5483                                        int_x86_avx2_pmovsxbw>, VEX, VEX_L;
5484defm VPMOVSXWD : SS41I_binop_rm_int16_y<0x23, "vpmovsxwd",
5485                                        int_x86_avx2_pmovsxwd>, VEX, VEX_L;
5486defm VPMOVSXDQ : SS41I_binop_rm_int16_y<0x25, "vpmovsxdq",
5487                                        int_x86_avx2_pmovsxdq>, VEX, VEX_L;
5488defm VPMOVZXBW : SS41I_binop_rm_int16_y<0x30, "vpmovzxbw",
5489                                        int_x86_avx2_pmovzxbw>, VEX, VEX_L;
5490defm VPMOVZXWD : SS41I_binop_rm_int16_y<0x33, "vpmovzxwd",
5491                                        int_x86_avx2_pmovzxwd>, VEX, VEX_L;
5492defm VPMOVZXDQ : SS41I_binop_rm_int16_y<0x35, "vpmovzxdq",
5493                                        int_x86_avx2_pmovzxdq>, VEX, VEX_L;
5494}
5495
// Legacy (non-VEX) SSE4.1 packed move-with-extend forms, all using the
// integer-ALU itinerary.
5496defm PMOVSXBW : SS41I_binop_rm_int8<0x20, "pmovsxbw",
                                        int_x86_sse41_pmovsxbw,
                                        SSE_INTALU_ITINS_P>;
5497defm PMOVSXWD : SS41I_binop_rm_int8<0x23, "pmovsxwd",
                                        int_x86_sse41_pmovsxwd,
                                        SSE_INTALU_ITINS_P>;
5498defm PMOVSXDQ : SS41I_binop_rm_int8<0x25, "pmovsxdq",
                                        int_x86_sse41_pmovsxdq,
                                        SSE_INTALU_ITINS_P>;
5499defm PMOVZXBW : SS41I_binop_rm_int8<0x30, "pmovzxbw",
                                        int_x86_sse41_pmovzxbw,
                                        SSE_INTALU_ITINS_P>;
5500defm PMOVZXWD : SS41I_binop_rm_int8<0x33, "pmovzxwd",
                                        int_x86_sse41_pmovzxwd,
                                        SSE_INTALU_ITINS_P>;
5501defm PMOVZXDQ : SS41I_binop_rm_int8<0x35, "pmovzxdq",
                                        int_x86_sse41_pmovzxdq,
                                        SSE_INTALU_ITINS_P>;
5502
// Fold scalar/partial loads into the AVX memory forms: each intrinsic
// applied to a zero-extended-move load (vzmovl/vzload) or a bitcast full
// vector load selects the corresponding VPMOV*rm instruction.
5503let Predicates = [HasAVX] in {
5504  // Common patterns involving scalar load.
5505  def : Pat<(int_x86_sse41_pmovsxbw (vzmovl_v2i64 addr:$src)),
5506            (VPMOVSXBWrm addr:$src)>;
5507  def : Pat<(int_x86_sse41_pmovsxbw (vzload_v2i64 addr:$src)),
5508            (VPMOVSXBWrm addr:$src)>;
5509  def : Pat<(int_x86_sse41_pmovsxbw (bc_v16i8 (loadv2i64 addr:$src))),
5510            (VPMOVSXBWrm addr:$src)>;
5511
5512  def : Pat<(int_x86_sse41_pmovsxwd (vzmovl_v2i64 addr:$src)),
5513            (VPMOVSXWDrm addr:$src)>;
5514  def : Pat<(int_x86_sse41_pmovsxwd (vzload_v2i64 addr:$src)),
5515            (VPMOVSXWDrm addr:$src)>;
5516  def : Pat<(int_x86_sse41_pmovsxwd (bc_v8i16 (loadv2i64 addr:$src))),
5517            (VPMOVSXWDrm addr:$src)>;
5518
5519  def : Pat<(int_x86_sse41_pmovsxdq (vzmovl_v2i64 addr:$src)),
5520            (VPMOVSXDQrm addr:$src)>;
5521  def : Pat<(int_x86_sse41_pmovsxdq (vzload_v2i64 addr:$src)),
5522            (VPMOVSXDQrm addr:$src)>;
5523  def : Pat<(int_x86_sse41_pmovsxdq (bc_v4i32 (loadv2i64 addr:$src))),
5524            (VPMOVSXDQrm addr:$src)>;
5525
5526  def : Pat<(int_x86_sse41_pmovzxbw (vzmovl_v2i64 addr:$src)),
5527            (VPMOVZXBWrm addr:$src)>;
5528  def : Pat<(int_x86_sse41_pmovzxbw (vzload_v2i64 addr:$src)),
5529            (VPMOVZXBWrm addr:$src)>;
5530  def : Pat<(int_x86_sse41_pmovzxbw (bc_v16i8 (loadv2i64 addr:$src))),
5531            (VPMOVZXBWrm addr:$src)>;
5532
5533  def : Pat<(int_x86_sse41_pmovzxwd (vzmovl_v2i64 addr:$src)),
5534            (VPMOVZXWDrm addr:$src)>;
5535  def : Pat<(int_x86_sse41_pmovzxwd (vzload_v2i64 addr:$src)),
5536            (VPMOVZXWDrm addr:$src)>;
5537  def : Pat<(int_x86_sse41_pmovzxwd (bc_v8i16 (loadv2i64 addr:$src))),
5538            (VPMOVZXWDrm addr:$src)>;
5539
5540  def : Pat<(int_x86_sse41_pmovzxdq (vzmovl_v2i64 addr:$src)),
5541            (VPMOVZXDQrm addr:$src)>;
5542  def : Pat<(int_x86_sse41_pmovzxdq (vzload_v2i64 addr:$src)),
5543            (VPMOVZXDQrm addr:$src)>;
5544  def : Pat<(int_x86_sse41_pmovzxdq (bc_v4i32 (loadv2i64 addr:$src))),
5545            (VPMOVZXDQrm addr:$src)>;
5546}
5547
// Same load-folding patterns as the HasAVX block above, selecting the
// legacy (non-VEX) PMOV*rm forms when AVX is not in use.
5548let Predicates = [UseSSE41] in {
5549  // Common patterns involving scalar load.
5550  def : Pat<(int_x86_sse41_pmovsxbw (vzmovl_v2i64 addr:$src)),
5551            (PMOVSXBWrm addr:$src)>;
5552  def : Pat<(int_x86_sse41_pmovsxbw (vzload_v2i64 addr:$src)),
5553            (PMOVSXBWrm addr:$src)>;
5554  def : Pat<(int_x86_sse41_pmovsxbw (bc_v16i8 (loadv2i64 addr:$src))),
5555            (PMOVSXBWrm addr:$src)>;
5556
5557  def : Pat<(int_x86_sse41_pmovsxwd (vzmovl_v2i64 addr:$src)),
5558            (PMOVSXWDrm addr:$src)>;
5559  def : Pat<(int_x86_sse41_pmovsxwd (vzload_v2i64 addr:$src)),
5560            (PMOVSXWDrm addr:$src)>;
5561  def : Pat<(int_x86_sse41_pmovsxwd (bc_v8i16 (loadv2i64 addr:$src))),
5562            (PMOVSXWDrm addr:$src)>;
5563
5564  def : Pat<(int_x86_sse41_pmovsxdq (vzmovl_v2i64 addr:$src)),
5565            (PMOVSXDQrm addr:$src)>;
5566  def : Pat<(int_x86_sse41_pmovsxdq (vzload_v2i64 addr:$src)),
5567            (PMOVSXDQrm addr:$src)>;
5568  def : Pat<(int_x86_sse41_pmovsxdq (bc_v4i32 (loadv2i64 addr:$src))),
5569            (PMOVSXDQrm addr:$src)>;
5570
5571  def : Pat<(int_x86_sse41_pmovzxbw (vzmovl_v2i64 addr:$src)),
5572            (PMOVZXBWrm addr:$src)>;
5573  def : Pat<(int_x86_sse41_pmovzxbw (vzload_v2i64 addr:$src)),
5574            (PMOVZXBWrm addr:$src)>;
5575  def : Pat<(int_x86_sse41_pmovzxbw (bc_v16i8 (loadv2i64 addr:$src))),
5576            (PMOVZXBWrm addr:$src)>;
5577
5578  def : Pat<(int_x86_sse41_pmovzxwd (vzmovl_v2i64 addr:$src)),
5579            (PMOVZXWDrm addr:$src)>;
5580  def : Pat<(int_x86_sse41_pmovzxwd (vzload_v2i64 addr:$src)),
5581            (PMOVZXWDrm addr:$src)>;
5582  def : Pat<(int_x86_sse41_pmovzxwd (bc_v8i16 (loadv2i64 addr:$src))),
5583            (PMOVZXWDrm addr:$src)>;
5584
5585  def : Pat<(int_x86_sse41_pmovzxdq (vzmovl_v2i64 addr:$src)),
5586            (PMOVZXDQrm addr:$src)>;
5587  def : Pat<(int_x86_sse41_pmovzxdq (vzload_v2i64 addr:$src)),
5588            (PMOVZXDQrm addr:$src)>;
5589  def : Pat<(int_x86_sse41_pmovzxdq (bc_v4i32 (loadv2i64 addr:$src))),
5590            (PMOVZXDQrm addr:$src)>;
5591}
5592
// Select 256-bit move-with-extend for the low-half extend nodes
// (X86vzmovly / X86vsmovl) on a full xmm source. The zero-extend forms
// carry AddedComplexity so they win over competing patterns.
5593let Predicates = [HasAVX2] in {
5594  let AddedComplexity = 15 in {
5595    def : Pat<(v4i64 (X86vzmovly (v4i32 VR128:$src))),
5596              (VPMOVZXDQYrr VR128:$src)>;
5597    def : Pat<(v8i32 (X86vzmovly (v8i16 VR128:$src))),
5598              (VPMOVZXWDYrr VR128:$src)>;
5599    def : Pat<(v16i16 (X86vzmovly (v16i8 VR128:$src))),
5600              (VPMOVZXBWYrr VR128:$src)>;
5601  }
5602
5603  def : Pat<(v4i64 (X86vsmovl (v4i32 VR128:$src))), (VPMOVSXDQYrr VR128:$src)>;
5604  def : Pat<(v8i32 (X86vsmovl (v8i16 VR128:$src))), (VPMOVSXWDYrr VR128:$src)>;
5605  def : Pat<(v16i16 (X86vsmovl (v16i8 VR128:$src))), (VPMOVSXBWYrr VR128:$src)>;
5606}
5607
// 128-bit X86vsmovl (sign-extend of the low half) selects pmovsx;
// VEX form under AVX, legacy form otherwise.
5608let Predicates = [HasAVX] in {
5609  def : Pat<(v2i64 (X86vsmovl (v4i32 VR128:$src))), (VPMOVSXDQrr VR128:$src)>;
5610  def : Pat<(v4i32 (X86vsmovl (v8i16 VR128:$src))), (VPMOVSXWDrr VR128:$src)>;
5611  def : Pat<(v8i16 (X86vsmovl (v16i8 VR128:$src))), (VPMOVSXBWrr VR128:$src)>;
5612}
5613
5614let Predicates = [UseSSE41] in {
5615  def : Pat<(v2i64 (X86vsmovl (v4i32 VR128:$src))), (PMOVSXDQrr VR128:$src)>;
5616  def : Pat<(v4i32 (X86vsmovl (v8i16 VR128:$src))), (PMOVSXWDrr VR128:$src)>;
5617  def : Pat<(v8i16 (X86vsmovl (v16i8 VR128:$src))), (PMOVSXBWrr VR128:$src)>;
5618}
5619
5620
// SS41I_binop_rm_int4 - 128-bit pmovsx/pmovzx forms whose memory variant
// reads 4 bytes (bd/wq widenings). The rm pattern matches a scalar i32
// load placed in the low dword of a v4i32.
5621multiclass SS41I_binop_rm_int4<bits<8> opc, string OpcodeStr, Intrinsic IntId,
5622                               OpndItins itins = DEFAULT_ITINS> {
5623  def rr : SS48I<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
5624                 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
5625                 [(set VR128:$dst, (IntId VR128:$src))], itins.rr>, OpSize;
5626
5627  def rm : SS48I<opc, MRMSrcMem, (outs VR128:$dst), (ins i32mem:$src),
5628                 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
5629       [(set VR128:$dst,
5630         (IntId (bitconvert (v4i32 (scalar_to_vector (loadi32 addr:$src))))))],
5631         itins.rm>,
5632          OpSize;
5633}
5634
// SS41I_binop_rm_int8_y - AVX2 256-bit pmovsx/pmovzx forms whose memory
// variant reads 8 bytes (bd/wq widenings of an xmm source to a ymm
// destination).
5635multiclass SS41I_binop_rm_int8_y<bits<8> opc, string OpcodeStr,
5636                                 Intrinsic IntId> {
5637  def Yrr : SS48I<opc, MRMSrcReg, (outs VR256:$dst), (ins VR128:$src),
5638                  !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
5639                  [(set VR256:$dst, (IntId VR128:$src))]>, OpSize;
5640
  // The pattern matches a scalar i64 load, so the memory operand must be
  // a qword: use i64mem (i32mem would print/parse "dword ptr" for an
  // 8-byte access).
5641  def Yrm : SS48I<opc, MRMSrcMem, (outs VR256:$dst), (ins i64mem:$src),
5642                  !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
5643       [(set VR256:$dst,
5644         (IntId (bitconvert (v2i64 (scalar_to_vector (loadi64 addr:$src))))))]>,
5645          OpSize;
5646}
5647
// bd/wq widenings: AVX 128-bit, AVX2 256-bit (VEX_L), and legacy SSE4.1
// instantiations of the 4-byte- and 8-byte-memory multiclasses above.
5648let Predicates = [HasAVX] in {
5649defm VPMOVSXBD : SS41I_binop_rm_int4<0x21, "vpmovsxbd", int_x86_sse41_pmovsxbd>,
5650                                     VEX;
5651defm VPMOVSXWQ : SS41I_binop_rm_int4<0x24, "vpmovsxwq", int_x86_sse41_pmovsxwq>,
5652                                     VEX;
5653defm VPMOVZXBD : SS41I_binop_rm_int4<0x31, "vpmovzxbd", int_x86_sse41_pmovzxbd>,
5654                                     VEX;
5655defm VPMOVZXWQ : SS41I_binop_rm_int4<0x34, "vpmovzxwq", int_x86_sse41_pmovzxwq>,
5656                                     VEX;
5657}
5658
5659let Predicates = [HasAVX2] in {
5660defm VPMOVSXBD : SS41I_binop_rm_int8_y<0x21, "vpmovsxbd",
5661                                       int_x86_avx2_pmovsxbd>, VEX, VEX_L;
5662defm VPMOVSXWQ : SS41I_binop_rm_int8_y<0x24, "vpmovsxwq",
5663                                       int_x86_avx2_pmovsxwq>, VEX, VEX_L;
5664defm VPMOVZXBD : SS41I_binop_rm_int8_y<0x31, "vpmovzxbd",
5665                                       int_x86_avx2_pmovzxbd>, VEX, VEX_L;
5666defm VPMOVZXWQ : SS41I_binop_rm_int8_y<0x34, "vpmovzxwq",
5667                                       int_x86_avx2_pmovzxwq>, VEX, VEX_L;
5668}
5669
5670defm PMOVSXBD   : SS41I_binop_rm_int4<0x21, "pmovsxbd", int_x86_sse41_pmovsxbd,
5671                                      SSE_INTALU_ITINS_P>;
5672defm PMOVSXWQ   : SS41I_binop_rm_int4<0x24, "pmovsxwq", int_x86_sse41_pmovsxwq,
5673                                      SSE_INTALU_ITINS_P>;
5674defm PMOVZXBD   : SS41I_binop_rm_int4<0x31, "pmovzxbd", int_x86_sse41_pmovzxbd,
5675                                      SSE_INTALU_ITINS_P>;
5676defm PMOVZXWQ   : SS41I_binop_rm_int4<0x34, "pmovzxwq", int_x86_sse41_pmovzxwq,
5677                                      SSE_INTALU_ITINS_P>;
5678
// Fold a vzmovl'd v4i32 scalar load into the bd/wq memory forms
// (VEX under AVX, legacy otherwise).
5679let Predicates = [HasAVX] in {
5680  // Common patterns involving scalar load
5681  def : Pat<(int_x86_sse41_pmovsxbd (vzmovl_v4i32 addr:$src)),
5682            (VPMOVSXBDrm addr:$src)>;
5683  def : Pat<(int_x86_sse41_pmovsxwq (vzmovl_v4i32 addr:$src)),
5684            (VPMOVSXWQrm addr:$src)>;
5685
5686  def : Pat<(int_x86_sse41_pmovzxbd (vzmovl_v4i32 addr:$src)),
5687            (VPMOVZXBDrm addr:$src)>;
5688  def : Pat<(int_x86_sse41_pmovzxwq (vzmovl_v4i32 addr:$src)),
5689            (VPMOVZXWQrm addr:$src)>;
5690}
5691
5692let Predicates = [UseSSE41] in {
5693  // Common patterns involving scalar load
5694  def : Pat<(int_x86_sse41_pmovsxbd (vzmovl_v4i32 addr:$src)),
5695            (PMOVSXBDrm addr:$src)>;
5696  def : Pat<(int_x86_sse41_pmovsxwq (vzmovl_v4i32 addr:$src)),
5697            (PMOVSXWQrm addr:$src)>;
5698
5699  def : Pat<(int_x86_sse41_pmovzxbd (vzmovl_v4i32 addr:$src)),
5700            (PMOVZXBDrm addr:$src)>;
5701  def : Pat<(int_x86_sse41_pmovzxwq (vzmovl_v4i32 addr:$src)),
5702            (PMOVZXWQrm addr:$src)>;
5703}
5704
// SS41I_binop_rm_int2 - 128-bit pmovsx/pmovzx forms whose memory variant
// reads 2 bytes (bq widening: byte -> quadword).
5705multiclass SS41I_binop_rm_int2<bits<8> opc, string OpcodeStr, Intrinsic IntId,
5706                               OpndItins itins = DEFAULT_ITINS> {
5707  def rr : SS48I<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
5708                 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
5709                 [(set VR128:$dst, (IntId VR128:$src))]>, OpSize;
5710
5711  // Expecting a i16 load any extended to i32 value.
5712  def rm : SS48I<opc, MRMSrcMem, (outs VR128:$dst), (ins i16mem:$src),
5713                 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
5714                 [(set VR128:$dst, (IntId (bitconvert
5715                     (v4i32 (scalar_to_vector (loadi16_anyext addr:$src))))))]>,
5716                 OpSize;
5717}
5718
// SS41I_binop_rm_int4_y - AVX2 256-bit pmovsx/pmovzx forms whose memory
// variant reads 4 bytes (bq widening of an xmm source to a ymm
// destination).
5719multiclass SS41I_binop_rm_int4_y<bits<8> opc, string OpcodeStr,
5720                                 Intrinsic IntId> {
5721  def Yrr : SS48I<opc, MRMSrcReg, (outs VR256:$dst), (ins VR128:$src),
5722                 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
5723                 [(set VR256:$dst, (IntId VR128:$src))]>, OpSize;
5724
5725  // The pattern matches a scalar i32 load, so the memory operand must be
  // a dword: use i32mem (i16mem would print/parse "word ptr" for a
  // 4-byte access).
5726  def Yrm : SS48I<opc, MRMSrcMem, (outs VR256:$dst), (ins i32mem:$src),
5727                  !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
5728                  [(set VR256:$dst, (IntId (bitconvert
5729                      (v4i32 (scalar_to_vector (loadi32 addr:$src))))))]>,
5730                  OpSize;
5731}
5732
// bq widenings: AVX 128-bit, AVX2 256-bit (VEX_L), and legacy SSE4.1
// instantiations.
5733let Predicates = [HasAVX] in {
5734defm VPMOVSXBQ : SS41I_binop_rm_int2<0x22, "vpmovsxbq", int_x86_sse41_pmovsxbq>,
5735                                     VEX;
5736defm VPMOVZXBQ : SS41I_binop_rm_int2<0x32, "vpmovzxbq", int_x86_sse41_pmovzxbq>,
5737                                     VEX;
5738}
5739let Predicates = [HasAVX2] in {
5740defm VPMOVSXBQ : SS41I_binop_rm_int4_y<0x22, "vpmovsxbq",
5741                                       int_x86_avx2_pmovsxbq>, VEX, VEX_L;
5742defm VPMOVZXBQ : SS41I_binop_rm_int4_y<0x32, "vpmovzxbq",
5743                                       int_x86_avx2_pmovzxbq>, VEX, VEX_L;
5744}
5745defm PMOVSXBQ   : SS41I_binop_rm_int2<0x22, "pmovsxbq", int_x86_sse41_pmovsxbq,
5746                                      SSE_INTALU_ITINS_P>;
5747defm PMOVZXBQ   : SS41I_binop_rm_int2<0x32, "pmovzxbq", int_x86_sse41_pmovzxbq,
5748                                      SSE_INTALU_ITINS_P>;
5749
// Select 256-bit vpmovsx for X86vsext nodes: from an xmm source directly,
// from the low xmm of a ymm source (via EXTRACT_SUBREG), and from loads
// of the exact number of source elements needed.
5750let Predicates = [HasAVX2] in {
5751  def : Pat<(v16i16 (X86vsext (v16i8 VR128:$src))), (VPMOVSXBWYrr VR128:$src)>;
5752  def : Pat<(v8i32  (X86vsext (v16i8 VR128:$src))), (VPMOVSXBDYrr VR128:$src)>;
5753  def : Pat<(v4i64  (X86vsext (v16i8 VR128:$src))), (VPMOVSXBQYrr VR128:$src)>;
5754
5755  def : Pat<(v8i32  (X86vsext (v8i16 VR128:$src))), (VPMOVSXWDYrr VR128:$src)>;
5756  def : Pat<(v4i64  (X86vsext (v8i16 VR128:$src))), (VPMOVSXWQYrr VR128:$src)>;
5757
5758  def : Pat<(v4i64  (X86vsext (v4i32 VR128:$src))), (VPMOVSXDQYrr VR128:$src)>;
5759
  // 256-bit sources: only the low 128 bits are consumed.
5760  def : Pat<(v16i16 (X86vsext (v32i8 VR256:$src))),
5761            (VPMOVSXBWYrr (EXTRACT_SUBREG VR256:$src, sub_xmm))>;
5762  def : Pat<(v8i32 (X86vsext (v32i8 VR256:$src))),
5763            (VPMOVSXBDYrr (EXTRACT_SUBREG VR256:$src, sub_xmm))>;
5764  def : Pat<(v4i64 (X86vsext (v32i8 VR256:$src))),
5765            (VPMOVSXBQYrr (EXTRACT_SUBREG VR256:$src, sub_xmm))>;
5766
5767  def : Pat<(v8i32 (X86vsext (v16i16 VR256:$src))),
5768            (VPMOVSXWDYrr (EXTRACT_SUBREG VR256:$src, sub_xmm))>;
5769  def : Pat<(v4i64 (X86vsext (v16i16 VR256:$src))),
5770            (VPMOVSXWQYrr (EXTRACT_SUBREG VR256:$src, sub_xmm))>;
5771
5772  def : Pat<(v4i64 (X86vsext (v8i32 VR256:$src))),
5773            (VPMOVSXDQYrr (EXTRACT_SUBREG VR256:$src, sub_xmm))>;
5774
  // Fold 16-byte loads into the wd/dq memory forms.
5775  def : Pat<(v8i32 (X86vsmovl (v8i16 (bitconvert (v2i64 (load addr:$src)))))),
5776            (VPMOVSXWDYrm addr:$src)>;
5777  def : Pat<(v4i64 (X86vsmovl (v4i32 (bitconvert (v2i64 (load addr:$src)))))),
5778            (VPMOVSXDQYrm addr:$src)>;
5779
  // Fold 8- and 4-byte scalar loads into the bd/wq/bq memory forms.
5780  def : Pat<(v8i32 (X86vsext (v16i8 (bitconvert (v2i64 
5781                    (scalar_to_vector (loadi64 addr:$src))))))),
5782            (VPMOVSXBDYrm addr:$src)>;
5783  def : Pat<(v8i32 (X86vsext (v16i8 (bitconvert (v2f64 
5784                    (scalar_to_vector (loadf64 addr:$src))))))),
5785            (VPMOVSXBDYrm addr:$src)>;
5786
5787  def : Pat<(v4i64 (X86vsext (v8i16 (bitconvert (v2i64 
5788                    (scalar_to_vector (loadi64 addr:$src))))))),
5789            (VPMOVSXWQYrm addr:$src)>;
5790  def : Pat<(v4i64 (X86vsext (v8i16 (bitconvert (v2f64 
5791                    (scalar_to_vector (loadf64 addr:$src))))))),
5792            (VPMOVSXWQYrm addr:$src)>;
5793
5794  def : Pat<(v4i64 (X86vsext (v16i8 (bitconvert (v4i32 
5795                    (scalar_to_vector (loadi32 addr:$src))))))),
5796            (VPMOVSXBQYrm addr:$src)>;
5797}
5798
// Fold a zero-extended i32 scalar load into the 128-bit bq intrinsic
// memory forms (only the low two bytes of the loaded dword are consumed).
5799let Predicates = [HasAVX] in {
5800  // Common patterns involving scalar load
5801  def : Pat<(int_x86_sse41_pmovsxbq
5802              (bitconvert (v4i32 (X86vzmovl
5803                            (v4i32 (scalar_to_vector (loadi32 addr:$src))))))),
5804            (VPMOVSXBQrm addr:$src)>;
5805
5806  def : Pat<(int_x86_sse41_pmovzxbq
5807              (bitconvert (v4i32 (X86vzmovl
5808                            (v4i32 (scalar_to_vector (loadi32 addr:$src))))))),
5809            (VPMOVZXBQrm addr:$src)>;
5810}
5811
// Legacy (non-VEX) selection for X86vsext: register forms, the bq
// intrinsic scalar-load folds, and scalar-load folds sized to the number
// of source elements each widening consumes.
5812let Predicates = [UseSSE41] in {
5813  def : Pat<(v8i16 (X86vsext (v16i8 VR128:$src))), (PMOVSXBWrr VR128:$src)>;
5814  def : Pat<(v4i32 (X86vsext (v16i8 VR128:$src))), (PMOVSXBDrr VR128:$src)>;
5815  def : Pat<(v2i64 (X86vsext (v16i8 VR128:$src))), (PMOVSXBQrr VR128:$src)>;
5816
5817  def : Pat<(v4i32 (X86vsext (v8i16 VR128:$src))), (PMOVSXWDrr VR128:$src)>;
5818  def : Pat<(v2i64 (X86vsext (v8i16 VR128:$src))), (PMOVSXWQrr VR128:$src)>;
5819
5820  def : Pat<(v2i64 (X86vsext (v4i32 VR128:$src))), (PMOVSXDQrr VR128:$src)>;
5821
5822  // Common patterns involving scalar load
5823  def : Pat<(int_x86_sse41_pmovsxbq
5824              (bitconvert (v4i32 (X86vzmovl
5825                            (v4i32 (scalar_to_vector (loadi32 addr:$src))))))),
5826            (PMOVSXBQrm addr:$src)>;
5827
5828  def : Pat<(int_x86_sse41_pmovzxbq
5829              (bitconvert (v4i32 (X86vzmovl
5830                            (v4i32 (scalar_to_vector (loadi32 addr:$src))))))),
5831            (PMOVZXBQrm addr:$src)>;
5832
5833  def : Pat<(v4i32 (X86vsext (v8i16 (bitconvert (v2i64
5834                    (scalar_to_vector (loadi64 addr:$src))))))),
5835            (PMOVSXWDrm addr:$src)>;
5836  def : Pat<(v4i32 (X86vsext (v8i16 (bitconvert (v2f64
5837                    (scalar_to_vector (loadf64 addr:$src))))))),
5838            (PMOVSXWDrm addr:$src)>;
5839  def : Pat<(v4i32 (X86vsext (v16i8 (bitconvert (v4i32
5840                    (scalar_to_vector (loadi32 addr:$src))))))),
5841            (PMOVSXBDrm addr:$src)>;
5842  def : Pat<(v2i64 (X86vsext (v8i16 (bitconvert (v4i32
5843                    (scalar_to_vector (loadi32 addr:$src))))))),
5844            (PMOVSXWQrm addr:$src)>;
5845  def : Pat<(v2i64 (X86vsext (v16i8 (bitconvert (v4i32
5846                    (scalar_to_vector (extloadi32i16 addr:$src))))))),
5847            (PMOVSXBQrm addr:$src)>;
5848  def : Pat<(v2i64 (X86vsext (v4i32 (bitconvert (v2i64
5849                    (scalar_to_vector (loadi64 addr:$src))))))),
5850            (PMOVSXDQrm addr:$src)>;
5851  def : Pat<(v2i64 (X86vsext (v4i32 (bitconvert (v2f64
5852                    (scalar_to_vector (loadf64 addr:$src))))))),
5853            (PMOVSXDQrm addr:$src)>;
5854  def : Pat<(v8i16 (X86vsext (v16i8 (bitconvert (v2i64
5855                    (scalar_to_vector (loadi64 addr:$src))))))),
5856            (PMOVSXBWrm addr:$src)>;
5857  def : Pat<(v8i16 (X86vsext (v16i8 (bitconvert (v2f64
5858                    (scalar_to_vector (loadf64 addr:$src))))))),
5859            (PMOVSXBWrm addr:$src)>;
5860}
5861
// Select 256-bit vpmovzx for X86vzext nodes: from an xmm source, or from
// the low xmm of a ymm source via EXTRACT_SUBREG.
5862let Predicates = [HasAVX2] in {
5863  def : Pat<(v16i16 (X86vzext (v16i8 VR128:$src))), (VPMOVZXBWYrr VR128:$src)>;
5864  def : Pat<(v8i32  (X86vzext (v16i8 VR128:$src))), (VPMOVZXBDYrr VR128:$src)>;
5865  def : Pat<(v4i64  (X86vzext (v16i8 VR128:$src))), (VPMOVZXBQYrr VR128:$src)>;
5866
5867  def : Pat<(v8i32  (X86vzext (v8i16 VR128:$src))), (VPMOVZXWDYrr VR128:$src)>;
5868  def : Pat<(v4i64  (X86vzext (v8i16 VR128:$src))), (VPMOVZXWQYrr VR128:$src)>;
5869
5870  def : Pat<(v4i64  (X86vzext (v4i32 VR128:$src))), (VPMOVZXDQYrr VR128:$src)>;
5871
5872  def : Pat<(v16i16 (X86vzext (v32i8 VR256:$src))),
5873            (VPMOVZXBWYrr (EXTRACT_SUBREG VR256:$src, sub_xmm))>;
5874  def : Pat<(v8i32 (X86vzext (v32i8 VR256:$src))),
5875            (VPMOVZXBDYrr (EXTRACT_SUBREG VR256:$src, sub_xmm))>;
5876  def : Pat<(v4i64 (X86vzext (v32i8 VR256:$src))),
5877            (VPMOVZXBQYrr (EXTRACT_SUBREG VR256:$src, sub_xmm))>;
5878
5879  def : Pat<(v8i32 (X86vzext (v16i16 VR256:$src))),
5880            (VPMOVZXWDYrr (EXTRACT_SUBREG VR256:$src, sub_xmm))>;
5881  def : Pat<(v4i64 (X86vzext (v16i16 VR256:$src))),
5882            (VPMOVZXWQYrr (EXTRACT_SUBREG VR256:$src, sub_xmm))>;
5883
5884  def : Pat<(v4i64 (X86vzext (v8i32 VR256:$src))),
5885            (VPMOVZXDQYrr (EXTRACT_SUBREG VR256:$src, sub_xmm))>;
5886}
5887
// AVX 128-bit selection for X86vzext and X86vsext: register forms plus
// scalar-load folds sized to the number of source elements each widening
// consumes (8 bytes for bw/wd/dq, 4 bytes for bd/wq, 2 bytes for bq).
5888let Predicates = [HasAVX] in {
5889  def : Pat<(v8i16 (X86vzext (v16i8 VR128:$src))), (VPMOVZXBWrr VR128:$src)>;
5890  def : Pat<(v4i32 (X86vzext (v16i8 VR128:$src))), (VPMOVZXBDrr VR128:$src)>;
5891  def : Pat<(v2i64 (X86vzext (v16i8 VR128:$src))), (VPMOVZXBQrr VR128:$src)>;
5892
5893  def : Pat<(v4i32 (X86vzext (v8i16 VR128:$src))), (VPMOVZXWDrr VR128:$src)>;
5894  def : Pat<(v2i64 (X86vzext (v8i16 VR128:$src))), (VPMOVZXWQrr VR128:$src)>;
5895
5896  def : Pat<(v2i64 (X86vzext (v4i32 VR128:$src))), (VPMOVZXDQrr VR128:$src)>;
5897
5898  def : Pat<(v8i16 (X86vzext (v16i8 (bitconvert (v2i64 (scalar_to_vector (loadi64 addr:$src))))))),
5899            (VPMOVZXBWrm addr:$src)>;
5900  def : Pat<(v8i16 (X86vzext (v16i8 (bitconvert (v2f64 (scalar_to_vector (loadf64 addr:$src))))))),
5901            (VPMOVZXBWrm addr:$src)>;
5902  def : Pat<(v4i32 (X86vzext (v16i8 (bitconvert (v4i32 (scalar_to_vector (loadi32 addr:$src))))))),
5903            (VPMOVZXBDrm addr:$src)>;
5904  def : Pat<(v2i64 (X86vzext (v16i8 (bitconvert (v4i32 (scalar_to_vector (loadi16_anyext addr:$src))))))),
5905            (VPMOVZXBQrm addr:$src)>;
5906
5907  def : Pat<(v4i32 (X86vzext (v8i16 (bitconvert (v2i64 (scalar_to_vector (loadi64 addr:$src))))))),
5908            (VPMOVZXWDrm addr:$src)>;
5909  def : Pat<(v4i32 (X86vzext (v8i16 (bitconvert (v2f64 (scalar_to_vector (loadf64 addr:$src))))))),
5910            (VPMOVZXWDrm addr:$src)>;
5911  def : Pat<(v2i64 (X86vzext (v8i16 (bitconvert (v4i32 (scalar_to_vector (loadi32 addr:$src))))))),
5912            (VPMOVZXWQrm addr:$src)>;
5913
5914  def : Pat<(v2i64 (X86vzext (v4i32 (bitconvert (v2i64 (scalar_to_vector (loadi64 addr:$src))))))),
5915            (VPMOVZXDQrm addr:$src)>;
5916  def : Pat<(v2i64 (X86vzext (v4i32 (bitconvert (v2f64 (scalar_to_vector (loadf64 addr:$src))))))),
5917            (VPMOVZXDQrm addr:$src)>;
5918  def : Pat<(v2i64 (X86vzext (v4i32 (bitconvert (v2i64 (X86vzload addr:$src)))))),
5919            (VPMOVZXDQrm addr:$src)>;
5920
5921  def : Pat<(v8i16 (X86vsext (v16i8 VR128:$src))), (VPMOVSXBWrr VR128:$src)>;
5922  def : Pat<(v4i32 (X86vsext (v16i8 VR128:$src))), (VPMOVSXBDrr VR128:$src)>;
5923  def : Pat<(v2i64 (X86vsext (v16i8 VR128:$src))), (VPMOVSXBQrr VR128:$src)>;
5924
5925  def : Pat<(v4i32 (X86vsext (v8i16 VR128:$src))), (VPMOVSXWDrr VR128:$src)>;
5926  def : Pat<(v2i64 (X86vsext (v8i16 VR128:$src))), (VPMOVSXWQrr VR128:$src)>;
5927
5928  def : Pat<(v2i64 (X86vsext (v4i32 VR128:$src))), (VPMOVSXDQrr VR128:$src)>;
5929
5930  def : Pat<(v4i32 (X86vsext (v8i16 (bitconvert (v2i64
5931                    (scalar_to_vector (loadi64 addr:$src))))))),
5932            (VPMOVSXWDrm addr:$src)>;
5933  def : Pat<(v2i64 (X86vsext (v4i32 (bitconvert (v2i64
5934                    (scalar_to_vector (loadi64 addr:$src))))))),
5935            (VPMOVSXDQrm addr:$src)>;
5936  def : Pat<(v4i32 (X86vsext (v8i16 (bitconvert (v2f64
5937                    (scalar_to_vector (loadf64 addr:$src))))))),
5938            (VPMOVSXWDrm addr:$src)>;
5939  def : Pat<(v2i64 (X86vsext (v4i32 (bitconvert (v2f64
5940                    (scalar_to_vector (loadf64 addr:$src))))))),
5941            (VPMOVSXDQrm addr:$src)>;
5942  def : Pat<(v8i16 (X86vsext (v16i8 (bitconvert (v2i64
5943                    (scalar_to_vector (loadi64 addr:$src))))))),
5944            (VPMOVSXBWrm addr:$src)>;
5945  def : Pat<(v8i16 (X86vsext (v16i8 (bitconvert (v2f64
5946                    (scalar_to_vector (loadf64 addr:$src))))))),
5947            (VPMOVSXBWrm addr:$src)>;
5948
5949  def : Pat<(v4i32 (X86vsext (v16i8 (bitconvert (v4i32
5950                    (scalar_to_vector (loadi32 addr:$src))))))),
5951            (VPMOVSXBDrm addr:$src)>;
5952  def : Pat<(v2i64 (X86vsext (v8i16 (bitconvert (v4i32
5953                    (scalar_to_vector (loadi32 addr:$src))))))),
5954            (VPMOVSXWQrm addr:$src)>;
5955  def : Pat<(v2i64 (X86vsext (v16i8 (bitconvert (v4i32
5956                    (scalar_to_vector (extloadi32i16 addr:$src))))))),
5957            (VPMOVSXBQrm addr:$src)>;
5958}
5959
// Legacy (non-VEX) selection for X86vzext, mirroring the HasAVX zext
// patterns above.
5960let Predicates = [UseSSE41] in {
5961  def : Pat<(v8i16 (X86vzext (v16i8 VR128:$src))), (PMOVZXBWrr VR128:$src)>;
5962  def : Pat<(v4i32 (X86vzext (v16i8 VR128:$src))), (PMOVZXBDrr VR128:$src)>;
5963  def : Pat<(v2i64 (X86vzext (v16i8 VR128:$src))), (PMOVZXBQrr VR128:$src)>;
5964
5965  def : Pat<(v4i32 (X86vzext (v8i16 VR128:$src))), (PMOVZXWDrr VR128:$src)>;
5966  def : Pat<(v2i64 (X86vzext (v8i16 VR128:$src))), (PMOVZXWQrr VR128:$src)>;
5967
5968  def : Pat<(v2i64 (X86vzext (v4i32 VR128:$src))), (PMOVZXDQrr VR128:$src)>;
5969
5970  def : Pat<(v8i16 (X86vzext (v16i8 (bitconvert (v2i64 (scalar_to_vector (loadi64 addr:$src))))))),
5971            (PMOVZXBWrm addr:$src)>;
5972  def : Pat<(v8i16 (X86vzext (v16i8 (bitconvert (v2f64 (scalar_to_vector (loadf64 addr:$src))))))),
5973            (PMOVZXBWrm addr:$src)>;
5974  def : Pat<(v4i32 (X86vzext (v16i8 (bitconvert (v4i32 (scalar_to_vector (loadi32 addr:$src))))))),
5975            (PMOVZXBDrm addr:$src)>;
5976  def : Pat<(v2i64 (X86vzext (v16i8 (bitconvert (v4i32 (scalar_to_vector (loadi16_anyext addr:$src))))))),
5977            (PMOVZXBQrm addr:$src)>;
5978
5979  def : Pat<(v4i32 (X86vzext (v8i16 (bitconvert (v2i64 (scalar_to_vector (loadi64 addr:$src))))))),
5980            (PMOVZXWDrm addr:$src)>;
5981  def : Pat<(v4i32 (X86vzext (v8i16 (bitconvert (v2f64 (scalar_to_vector (loadf64 addr:$src))))))),
5982            (PMOVZXWDrm addr:$src)>;
5983  def : Pat<(v2i64 (X86vzext (v8i16 (bitconvert (v4i32 (scalar_to_vector (loadi32 addr:$src))))))),
5984            (PMOVZXWQrm addr:$src)>;
5985
5986  def : Pat<(v2i64 (X86vzext (v4i32 (bitconvert (v2i64 (scalar_to_vector (loadi64 addr:$src))))))),
5987            (PMOVZXDQrm addr:$src)>;
5988  def : Pat<(v2i64 (X86vzext (v4i32 (bitconvert (v2f64 (scalar_to_vector (loadf64 addr:$src))))))),
5989            (PMOVZXDQrm addr:$src)>;
5990  def : Pat<(v2i64 (X86vzext (v4i32 (bitconvert (v2i64 (X86vzload addr:$src)))))),
5991            (PMOVZXDQrm addr:$src)>;
5992}
5993
5994//===----------------------------------------------------------------------===//
5995// SSE4.1 - Extract Instructions
5996//===----------------------------------------------------------------------===//
5997
5998/// SS41I_extract8 - SSE 4.1 extract 8 bits to 32 bit reg or 8 bit mem
5999multiclass SS41I_extract8<bits<8> opc, string OpcodeStr> {
6000  def rr : SS4AIi8<opc, MRMDestReg, (outs GR32orGR64:$dst),
6001                 (ins VR128:$src1, i32i8imm:$src2),
6002                 !strconcat(OpcodeStr,
6003                            "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
6004                 [(set GR32orGR64:$dst, (X86pextrb (v16i8 VR128:$src1),
6005                                         imm:$src2))]>,
6006                 OpSize;
  // Store form has no pattern; see the FIXME below.
6007  let neverHasSideEffects = 1, mayStore = 1 in
6008  def mr : SS4AIi8<opc, MRMDestMem, (outs),
6009                 (ins i8mem:$dst, VR128:$src1, i32i8imm:$src2),
6010                 !strconcat(OpcodeStr,
6011                            "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
6012                 []>, OpSize;
6013// FIXME:
6014// There's an AssertZext in the way of writing the store pattern
6015// (store (i8 (trunc (X86pextrb (v16i8 VR128:$src1), imm:$src2))), addr:$dst)
6016}
6017
// pextrb: VEX form under AVX, legacy form otherwise (same 0x14 opcode).
6018let Predicates = [HasAVX] in
6019  defm VPEXTRB : SS41I_extract8<0x14, "vpextrb">, VEX;
6020
6021defm PEXTRB      : SS41I_extract8<0x14, "pextrb">;
6022
6023
6024/// SS41I_extract16 - SSE 4.1 extract 16 bits to memory destination
// The register form duplicates SSE2's pextrw-to-GPR encoding, so it is
// codegen-only (used for disassembly/printing symmetry, not selection).
6025multiclass SS41I_extract16<bits<8> opc, string OpcodeStr> {
6026  let isCodeGenOnly = 1, hasSideEffects = 0 in
6027  def rr_REV : SS4AIi8<opc, MRMDestReg, (outs GR32orGR64:$dst),
6028                   (ins VR128:$src1, i32i8imm:$src2),
6029                   !strconcat(OpcodeStr,
6030                   "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
6031                   []>, OpSize;
6032
6033  let neverHasSideEffects = 1, mayStore = 1 in
6034  def mr : SS4AIi8<opc, MRMDestMem, (outs),
6035                 (ins i16mem:$dst, VR128:$src1, i32i8imm:$src2),
6036                 !strconcat(OpcodeStr,
6037                  "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
6038                 []>, OpSize;
6039// FIXME:
6040// There's an AssertZext in the way of writing the store pattern
6041// (store (i16 (trunc (X86pextrw (v8i16 VR128:$src1), imm:$src2))), addr:$dst)
6042}
6043
6044let Predicates = [HasAVX] in
6045  defm VPEXTRW : SS41I_extract16<0x15, "vpextrw">, VEX;
6046
6047defm PEXTRW      : SS41I_extract16<0x15, "pextrw">;
6048
6049
6050/// SS41I_extract32 - SSE 4.1 extract 32 bits to int reg or memory destination
// Both forms have selection patterns (plain extractelt on v4i32), unlike
// the 8/16-bit variants above.
6051multiclass SS41I_extract32<bits<8> opc, string OpcodeStr> {
6052  def rr : SS4AIi8<opc, MRMDestReg, (outs GR32:$dst),
6053                 (ins VR128:$src1, i32i8imm:$src2),
6054                 !strconcat(OpcodeStr,
6055                  "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
6056                 [(set GR32:$dst,
6057                  (extractelt (v4i32 VR128:$src1), imm:$src2))]>, OpSize;
6058  def mr : SS4AIi8<opc, MRMDestMem, (outs),
6059                 (ins i32mem:$dst, VR128:$src1, i32i8imm:$src2),
6060                 !strconcat(OpcodeStr,
6061                  "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
6062                 [(store (extractelt (v4i32 VR128:$src1), imm:$src2),
6063                          addr:$dst)]>, OpSize;
6064}
6065
6066let Predicates = [HasAVX] in
6067  defm VPEXTRD : SS41I_extract32<0x16, "vpextrd">, VEX;
6068
6069defm PEXTRD      : SS41I_extract32<0x16, "pextrd">;
6070
6071/// SS41I_extract64 - SSE 4.1 extract 64 bits to int reg or memory destination
// Same 0x16 opcode as pextrd; REX_W (VEX_W for the V-form) selects the
// 64-bit operation.
6072multiclass SS41I_extract64<bits<8> opc, string OpcodeStr> {
6073  def rr : SS4AIi8<opc, MRMDestReg, (outs GR64:$dst),
6074                 (ins VR128:$src1, i32i8imm:$src2),
6075                 !strconcat(OpcodeStr,
6076                  "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
6077                 [(set GR64:$dst,
6078                  (extractelt (v2i64 VR128:$src1), imm:$src2))]>, OpSize, REX_W;
6079  def mr : SS4AIi8<opc, MRMDestMem, (outs),
6080                 (ins i64mem:$dst, VR128:$src1, i32i8imm:$src2),
6081                 !strconcat(OpcodeStr,
6082                  "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
6083                 [(store (extractelt (v2i64 VR128:$src1), imm:$src2),
6084                          addr:$dst)]>, OpSize, REX_W;
6085}
6086
6087let Predicates = [HasAVX] in
6088  defm VPEXTRQ : SS41I_extract64<0x16, "vpextrq">, VEX, VEX_W;
6089
6090defm PEXTRQ      : SS41I_extract64<0x16, "pextrq">;
6091
/// SS41I_extractf32 - SSE 4.1 extract 32 bits fp value to int reg or memory
/// destination
multiclass SS41I_extractf32<bits<8> opc, string OpcodeStr,
                            OpndItins itins = DEFAULT_ITINS> {
  // Register destination: the f32 lane moves as its i32 bit pattern, hence
  // the bc_v4i32 bitcast in the pattern.
  def rr : SS4AIi8<opc, MRMDestReg, (outs GR32orGR64:$dst),
                 (ins VR128:$src1, i32i8imm:$src2),
                 !strconcat(OpcodeStr,
                  "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                 [(set GR32orGR64:$dst,
                    (extractelt (bc_v4i32 (v4f32 VR128:$src1)), imm:$src2))],
                    itins.rr>,
           OpSize;
  // Memory destination: store the selected lane's 32 bits directly.
  def mr : SS4AIi8<opc, MRMDestMem, (outs),
                 (ins f32mem:$dst, VR128:$src1, i32i8imm:$src2),
                 !strconcat(OpcodeStr,
                  "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                 [(store (extractelt (bc_v4i32 (v4f32 VR128:$src1)), imm:$src2),
                          addr:$dst)], itins.rm>, OpSize;
}

// EXTRACTPS lives in the single-precision domain; the VEX form is gated on
// UseAVX so it is only selected when AVX codegen is preferred.
let ExeDomain = SSEPackedSingle in {
  let Predicates = [UseAVX] in
    defm VEXTRACTPS : SS41I_extractf32<0x17, "vextractps">, VEX;
  defm EXTRACTPS   : SS41I_extractf32<0x17, "extractps", SSE_EXTRACT_ITINS>;
}
6117
// Also match an EXTRACTPS store when the store is done as f32 instead of i32.
def : Pat<(store (f32 (bitconvert (extractelt (bc_v4i32 (v4f32 VR128:$src1)),
                                              imm:$src2))),
                 addr:$dst),
          (VEXTRACTPSmr addr:$dst, VR128:$src1, imm:$src2)>,
          Requires<[HasAVX]>;
// Same pattern for the non-VEX encoding when AVX is not being used.
def : Pat<(store (f32 (bitconvert (extractelt (bc_v4i32 (v4f32 VR128:$src1)),
                                              imm:$src2))),
                 addr:$dst),
          (EXTRACTPSmr addr:$dst, VR128:$src1, imm:$src2)>,
          Requires<[UseSSE41]>;
6129
6130//===----------------------------------------------------------------------===//
6131// SSE4.1 - Insert Instructions
6132//===----------------------------------------------------------------------===//
6133
/// SS41I_insert8 - SSE 4.1 PINSRB: insert a byte from a GPR or from memory
/// into the v16i8 vector at the lane selected by $src3.  Is2Addr selects the
/// two-operand (legacy, $src1 tied to $dst) asm string vs. the VEX 3-operand
/// form.
multiclass SS41I_insert8<bits<8> opc, string asm, bit Is2Addr = 1> {
  def rr : SS4AIi8<opc, MRMSrcReg, (outs VR128:$dst),
      (ins VR128:$src1, GR32orGR64:$src2, i32i8imm:$src3),
      !if(Is2Addr,
        !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
        !strconcat(asm,
                   "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
      [(set VR128:$dst,
        (X86pinsrb VR128:$src1, GR32orGR64:$src2, imm:$src3))]>, OpSize;
  // Memory form folds an any-extending i8 load into the insert.
  def rm : SS4AIi8<opc, MRMSrcMem, (outs VR128:$dst),
      (ins VR128:$src1, i8mem:$src2, i32i8imm:$src3),
      !if(Is2Addr,
        !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
        !strconcat(asm,
                   "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
      [(set VR128:$dst,
        (X86pinsrb VR128:$src1, (extloadi8 addr:$src2),
                   imm:$src3))]>, OpSize;
}

let Predicates = [HasAVX] in
  defm VPINSRB : SS41I_insert8<0x20, "vpinsrb", 0>, VEX_4V;
let Constraints = "$src1 = $dst" in
  defm PINSRB  : SS41I_insert8<0x20, "pinsrb">;
6158
/// SS41I_insert32 - SSE 4.1 PINSRD: insert a dword from a GPR or from memory
/// into the v4i32 vector at the lane selected by $src3.
multiclass SS41I_insert32<bits<8> opc, string asm, bit Is2Addr = 1> {
  def rr : SS4AIi8<opc, MRMSrcReg, (outs VR128:$dst),
      (ins VR128:$src1, GR32:$src2, i32i8imm:$src3),
      !if(Is2Addr,
        !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
        !strconcat(asm,
                   "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
      [(set VR128:$dst,
        (v4i32 (insertelt VR128:$src1, GR32:$src2, imm:$src3)))]>,
      OpSize;
  // Memory form folds a 32-bit load into the insert.
  def rm : SS4AIi8<opc, MRMSrcMem, (outs VR128:$dst),
      (ins VR128:$src1, i32mem:$src2, i32i8imm:$src3),
      !if(Is2Addr,
        !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
        !strconcat(asm,
                   "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
      [(set VR128:$dst,
        (v4i32 (insertelt VR128:$src1, (loadi32 addr:$src2),
                          imm:$src3)))]>, OpSize;
}

let Predicates = [HasAVX] in
  defm VPINSRD : SS41I_insert32<0x22, "vpinsrd", 0>, VEX_4V;
let Constraints = "$src1 = $dst" in
  defm PINSRD : SS41I_insert32<0x22, "pinsrd">;
6184
/// SS41I_insert64 - SSE 4.1 PINSRQ: insert a qword from a GPR or from memory
/// into the v2i64 vector at the lane selected by $src3 (REX.W / VEX.W).
multiclass SS41I_insert64<bits<8> opc, string asm, bit Is2Addr = 1> {
  def rr : SS4AIi8<opc, MRMSrcReg, (outs VR128:$dst),
      (ins VR128:$src1, GR64:$src2, i32i8imm:$src3),
      !if(Is2Addr,
        !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
        !strconcat(asm,
                   "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
      [(set VR128:$dst,
        (v2i64 (insertelt VR128:$src1, GR64:$src2, imm:$src3)))]>,
      OpSize;
  // Memory form folds a 64-bit load into the insert.
  def rm : SS4AIi8<opc, MRMSrcMem, (outs VR128:$dst),
      (ins VR128:$src1, i64mem:$src2, i32i8imm:$src3),
      !if(Is2Addr,
        !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
        !strconcat(asm,
                   "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
      [(set VR128:$dst,
        (v2i64 (insertelt VR128:$src1, (loadi64 addr:$src2),
                          imm:$src3)))]>, OpSize;
}

let Predicates = [HasAVX] in
  defm VPINSRQ : SS41I_insert64<0x22, "vpinsrq", 0>, VEX_4V, VEX_W;
let Constraints = "$src1 = $dst" in
  defm PINSRQ : SS41I_insert64<0x22, "pinsrq">, REX_W;
6210
// insertps has a few different modes, there's the first two here below which
// are optimized inserts that won't zero arbitrary elements in the destination
// vector. The next one matches the intrinsic and could zero arbitrary elements
// in the target vector.
multiclass SS41I_insertf32<bits<8> opc, string asm, bit Is2Addr = 1,
                           OpndItins itins = DEFAULT_ITINS> {
  // Register form: full X86insrtps semantics (imm selects source/dest lanes
  // and an optional zero mask).
  def rr : SS4AIi8<opc, MRMSrcReg, (outs VR128:$dst),
      (ins VR128:$src1, VR128:$src2, u32u8imm:$src3),
      !if(Is2Addr,
        !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
        !strconcat(asm,
                   "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
      [(set VR128:$dst,
        (X86insrtps VR128:$src1, VR128:$src2, imm:$src3))], itins.rr>,
      OpSize;
  // Memory form: the scalar f32 load is placed in a vector before insertion.
  def rm : SS4AIi8<opc, MRMSrcMem, (outs VR128:$dst),
      (ins VR128:$src1, f32mem:$src2, u32u8imm:$src3),
      !if(Is2Addr,
        !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
        !strconcat(asm,
                   "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
      [(set VR128:$dst,
        (X86insrtps VR128:$src1,
                   (v4f32 (scalar_to_vector (loadf32 addr:$src2))),
                    imm:$src3))], itins.rm>, OpSize;
}

let ExeDomain = SSEPackedSingle in {
  let Predicates = [UseAVX] in
    defm VINSERTPS : SS41I_insertf32<0x21, "vinsertps", 0>, VEX_4V;
  let Constraints = "$src1 = $dst" in
    defm INSERTPS : SS41I_insertf32<0x21, "insertps", 1, SSE_INSERT_ITINS>;
}
6244
6245//===----------------------------------------------------------------------===//
6246// SSE4.1 - Round Instructions
6247//===----------------------------------------------------------------------===//
6248
/// sse41_fp_unop_rm - SSE 4.1 packed FP round (ROUNDPS/ROUNDPD and their
/// VEX/256-bit forms), register and folded-load variants.
///   opcps/opcpd       - opcodes for the PS and PD forms
///   x86memop/RC       - memory operand and register class (128- or 256-bit)
///   mem_frag32/64     - load fragments used to fold the memory operand
///   V4F32Int/V2F64Int - intrinsics matched by the PS/PD patterns
multiclass sse41_fp_unop_rm<bits<8> opcps, bits<8> opcpd, string OpcodeStr,
                            X86MemOperand x86memop, RegisterClass RC,
                            PatFrag mem_frag32, PatFrag mem_frag64,
                            Intrinsic V4F32Int, Intrinsic V2F64Int> {
let ExeDomain = SSEPackedSingle in {
  // Vector intrinsic operation, reg
  def PSr : SS4AIi8<opcps, MRMSrcReg,
                    (outs RC:$dst), (ins RC:$src1, i32i8imm:$src2),
                    !strconcat(OpcodeStr,
                    "ps\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                    [(set RC:$dst, (V4F32Int RC:$src1, imm:$src2))],
                    IIC_SSE_ROUNDPS_REG>,
                    OpSize;

  // Vector intrinsic operation, mem
  def PSm : SS4AIi8<opcps, MRMSrcMem,
                    (outs RC:$dst), (ins x86memop:$src1, i32i8imm:$src2),
                    !strconcat(OpcodeStr,
                    "ps\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                    [(set RC:$dst,
                          (V4F32Int (mem_frag32 addr:$src1),imm:$src2))],
                          IIC_SSE_ROUNDPS_MEM>,
                    OpSize;
} // ExeDomain = SSEPackedSingle

let ExeDomain = SSEPackedDouble in {
  // Vector intrinsic operation, reg
  // NOTE(review): the PD forms reuse the ROUNDPS itinerary classes; confirm
  // whether dedicated ROUNDPD itineraries exist before splitting them out.
  def PDr : SS4AIi8<opcpd, MRMSrcReg,
                    (outs RC:$dst), (ins RC:$src1, i32i8imm:$src2),
                    !strconcat(OpcodeStr,
                    "pd\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                    [(set RC:$dst, (V2F64Int RC:$src1, imm:$src2))],
                    IIC_SSE_ROUNDPS_REG>,
                    OpSize;

  // Vector intrinsic operation, mem
  def PDm : SS4AIi8<opcpd, MRMSrcMem,
                    (outs RC:$dst), (ins x86memop:$src1, i32i8imm:$src2),
                    !strconcat(OpcodeStr,
                    "pd\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                    [(set RC:$dst,
                          (V2F64Int (mem_frag64 addr:$src1),imm:$src2))],
                          // Bug fix: the memory form previously used the
                          // _REG itinerary; use _MEM to match PSm above.
                          IIC_SSE_ROUNDPS_MEM>,
                    OpSize;
} // ExeDomain = SSEPackedDouble
}
6296
/// sse41_fp_binop_rm - SSE 4.1 scalar FP round (ROUNDSS/ROUNDSD).  Emits
/// pattern-less FR32/FR64 register forms (selected via the Pat<>s below the
/// instantiations) plus intrinsic forms on VR128 with register and folded
/// scalar-load operands.  Is2Addr selects legacy two-operand vs. VEX
/// three-operand asm strings.
multiclass sse41_fp_binop_rm<bits<8> opcss, bits<8> opcsd,
                            string OpcodeStr,
                            Intrinsic F32Int,
                            Intrinsic F64Int, bit Is2Addr = 1> {
let ExeDomain = GenericDomain in {
  // Operation, reg.
  let hasSideEffects = 0 in
  def SSr : SS4AIi8<opcss, MRMSrcReg,
      (outs FR32:$dst), (ins FR32:$src1, FR32:$src2, i32i8imm:$src3),
      !if(Is2Addr,
          !strconcat(OpcodeStr,
              "ss\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
          !strconcat(OpcodeStr,
              "ss\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
      []>, OpSize;

  // Intrinsic operation, reg.
  def SSr_Int : SS4AIi8<opcss, MRMSrcReg,
        (outs VR128:$dst), (ins VR128:$src1, VR128:$src2, i32i8imm:$src3),
        !if(Is2Addr,
            !strconcat(OpcodeStr,
                "ss\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
            !strconcat(OpcodeStr,
                "ss\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
        [(set VR128:$dst, (F32Int VR128:$src1, VR128:$src2, imm:$src3))]>,
        OpSize;

  // Intrinsic operation, mem.
  def SSm : SS4AIi8<opcss, MRMSrcMem,
        (outs VR128:$dst), (ins VR128:$src1, ssmem:$src2, i32i8imm:$src3),
        !if(Is2Addr,
            !strconcat(OpcodeStr,
                "ss\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
            !strconcat(OpcodeStr,
                "ss\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
        [(set VR128:$dst,
             (F32Int VR128:$src1, sse_load_f32:$src2, imm:$src3))]>,
        OpSize;

  // Operation, reg.
  let hasSideEffects = 0 in
  def SDr : SS4AIi8<opcsd, MRMSrcReg,
        (outs FR64:$dst), (ins FR64:$src1, FR64:$src2, i32i8imm:$src3),
        !if(Is2Addr,
            !strconcat(OpcodeStr,
                "sd\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
            !strconcat(OpcodeStr,
                "sd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
        []>, OpSize;

  // Intrinsic operation, reg.
  def SDr_Int : SS4AIi8<opcsd, MRMSrcReg,
        (outs VR128:$dst), (ins VR128:$src1, VR128:$src2, i32i8imm:$src3),
        !if(Is2Addr,
            !strconcat(OpcodeStr,
                "sd\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
            !strconcat(OpcodeStr,
                "sd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
        [(set VR128:$dst, (F64Int VR128:$src1, VR128:$src2, imm:$src3))]>,
        OpSize;

  // Intrinsic operation, mem.
  def SDm : SS4AIi8<opcsd, MRMSrcMem,
        (outs VR128:$dst), (ins VR128:$src1, sdmem:$src2, i32i8imm:$src3),
        !if(Is2Addr,
            !strconcat(OpcodeStr,
                "sd\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
            !strconcat(OpcodeStr,
                "sd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
        [(set VR128:$dst,
              (F64Int VR128:$src1, sse_load_f64:$src2, imm:$src3))]>,
        OpSize;
} // ExeDomain = GenericDomain
}
6371
// FP round - roundss, roundps, roundsd, roundpd
let Predicates = [HasAVX] in {
  // Intrinsic form
  defm VROUND  : sse41_fp_unop_rm<0x08, 0x09, "vround", f128mem, VR128,
                                  loadv4f32, loadv2f64,
                                  int_x86_sse41_round_ps,
                                  int_x86_sse41_round_pd>, VEX;
  // 256-bit AVX packed rounds.
  defm VROUNDY : sse41_fp_unop_rm<0x08, 0x09, "vround", f256mem, VR256,
                                  loadv8f32, loadv4f64,
                                  int_x86_avx_round_ps_256,
                                  int_x86_avx_round_pd_256>, VEX, VEX_L;
  // Scalar (SS/SD) rounds in the VEX three-operand form.
  defm VROUND  : sse41_fp_binop_rm<0x0A, 0x0B, "vround",
                                  int_x86_sse41_round_ss,
                                  int_x86_sse41_round_sd, 0>, VEX_4V, VEX_LIG;
6386
6387  def : Pat<(ffloor FR32:$src),
6388            (VROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0x1))>;
  // Map FP-rounding ISD nodes to VROUND with the proper immediate:
  //   0x1 = round down (floor), 0x2 = round up (ceil), 0x3 = truncate,
  //   0x4 = current rounding mode (rint),
  //   0xC = current mode, suppress precision exception (nearbyint).
  def : Pat<(f64 (ffloor FR64:$src)),
            (VROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0x1))>;
  def : Pat<(f32 (fnearbyint FR32:$src)),
            (VROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0xC))>;
  def : Pat<(f64 (fnearbyint FR64:$src)),
            (VROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0xC))>;
  def : Pat<(f32 (fceil FR32:$src)),
            (VROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0x2))>;
  def : Pat<(f64 (fceil FR64:$src)),
            (VROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0x2))>;
  def : Pat<(f32 (frint FR32:$src)),
            (VROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0x4))>;
  def : Pat<(f64 (frint FR64:$src)),
            (VROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0x4))>;
  def : Pat<(f32 (ftrunc FR32:$src)),
            (VROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0x3))>;
  def : Pat<(f64 (ftrunc FR64:$src)),
            (VROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0x3))>;

  // 128-bit packed single-precision rounds.
  def : Pat<(v4f32 (ffloor VR128:$src)),
            (VROUNDPSr VR128:$src, (i32 0x1))>;
  def : Pat<(v4f32 (fnearbyint VR128:$src)),
            (VROUNDPSr VR128:$src, (i32 0xC))>;
  def : Pat<(v4f32 (fceil VR128:$src)),
            (VROUNDPSr VR128:$src, (i32 0x2))>;
  def : Pat<(v4f32 (frint VR128:$src)),
            (VROUNDPSr VR128:$src, (i32 0x4))>;
  def : Pat<(v4f32 (ftrunc VR128:$src)),
            (VROUNDPSr VR128:$src, (i32 0x3))>;

  // 128-bit packed double-precision rounds.
  def : Pat<(v2f64 (ffloor VR128:$src)),
            (VROUNDPDr VR128:$src, (i32 0x1))>;
  def : Pat<(v2f64 (fnearbyint VR128:$src)),
            (VROUNDPDr VR128:$src, (i32 0xC))>;
  def : Pat<(v2f64 (fceil VR128:$src)),
            (VROUNDPDr VR128:$src, (i32 0x2))>;
  def : Pat<(v2f64 (frint VR128:$src)),
            (VROUNDPDr VR128:$src, (i32 0x4))>;
  def : Pat<(v2f64 (ftrunc VR128:$src)),
            (VROUNDPDr VR128:$src, (i32 0x3))>;

  // 256-bit packed single-precision rounds.
  def : Pat<(v8f32 (ffloor VR256:$src)),
            (VROUNDYPSr VR256:$src, (i32 0x1))>;
  def : Pat<(v8f32 (fnearbyint VR256:$src)),
            (VROUNDYPSr VR256:$src, (i32 0xC))>;
  def : Pat<(v8f32 (fceil VR256:$src)),
            (VROUNDYPSr VR256:$src, (i32 0x2))>;
  def : Pat<(v8f32 (frint VR256:$src)),
            (VROUNDYPSr VR256:$src, (i32 0x4))>;
  def : Pat<(v8f32 (ftrunc VR256:$src)),
            (VROUNDYPSr VR256:$src, (i32 0x3))>;

  // 256-bit packed double-precision rounds.
  def : Pat<(v4f64 (ffloor VR256:$src)),
            (VROUNDYPDr VR256:$src, (i32 0x1))>;
  def : Pat<(v4f64 (fnearbyint VR256:$src)),
            (VROUNDYPDr VR256:$src, (i32 0xC))>;
  def : Pat<(v4f64 (fceil VR256:$src)),
            (VROUNDYPDr VR256:$src, (i32 0x2))>;
  def : Pat<(v4f64 (frint VR256:$src)),
            (VROUNDYPDr VR256:$src, (i32 0x4))>;
  def : Pat<(v4f64 (ftrunc VR256:$src)),
            (VROUNDYPDr VR256:$src, (i32 0x3))>;
}
6451}
6452
// Legacy SSE4.1 encodings: memory folding uses the alignment-checked memop
// fragments, and the scalar forms tie $src1 to $dst.
defm ROUND  : sse41_fp_unop_rm<0x08, 0x09, "round", f128mem, VR128,
                               memopv4f32, memopv2f64,
                               int_x86_sse41_round_ps, int_x86_sse41_round_pd>;
let Constraints = "$src1 = $dst" in
defm ROUND  : sse41_fp_binop_rm<0x0A, 0x0B, "round",
                               int_x86_sse41_round_ss, int_x86_sse41_round_sd>;
6459
6460let Predicates = [UseSSE41] in {
6461  def : Pat<(ffloor FR32:$src),
6462            (ROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0x1))>;
  // Same immediate encodings as the AVX patterns above: 0x1 floor, 0x2 ceil,
  // 0x3 trunc, 0x4 rint, 0xC nearbyint.
  def : Pat<(f64 (ffloor FR64:$src)),
            (ROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0x1))>;
  def : Pat<(f32 (fnearbyint FR32:$src)),
            (ROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0xC))>;
  def : Pat<(f64 (fnearbyint FR64:$src)),
            (ROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0xC))>;
  def : Pat<(f32 (fceil FR32:$src)),
            (ROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0x2))>;
  def : Pat<(f64 (fceil FR64:$src)),
            (ROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0x2))>;
  def : Pat<(f32 (frint FR32:$src)),
            (ROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0x4))>;
  def : Pat<(f64 (frint FR64:$src)),
            (ROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0x4))>;
  def : Pat<(f32 (ftrunc FR32:$src)),
            (ROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0x3))>;
  def : Pat<(f64 (ftrunc FR64:$src)),
            (ROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0x3))>;

  // 128-bit packed single-precision rounds.
  def : Pat<(v4f32 (ffloor VR128:$src)),
            (ROUNDPSr VR128:$src, (i32 0x1))>;
  def : Pat<(v4f32 (fnearbyint VR128:$src)),
            (ROUNDPSr VR128:$src, (i32 0xC))>;
  def : Pat<(v4f32 (fceil VR128:$src)),
            (ROUNDPSr VR128:$src, (i32 0x2))>;
  def : Pat<(v4f32 (frint VR128:$src)),
            (ROUNDPSr VR128:$src, (i32 0x4))>;
  def : Pat<(v4f32 (ftrunc VR128:$src)),
            (ROUNDPSr VR128:$src, (i32 0x3))>;

  // 128-bit packed double-precision rounds.
  def : Pat<(v2f64 (ffloor VR128:$src)),
            (ROUNDPDr VR128:$src, (i32 0x1))>;
  def : Pat<(v2f64 (fnearbyint VR128:$src)),
            (ROUNDPDr VR128:$src, (i32 0xC))>;
  def : Pat<(v2f64 (fceil VR128:$src)),
            (ROUNDPDr VR128:$src, (i32 0x2))>;
  def : Pat<(v2f64 (frint VR128:$src)),
            (ROUNDPDr VR128:$src, (i32 0x4))>;
  def : Pat<(v2f64 (ftrunc VR128:$src)),
            (ROUNDPDr VR128:$src, (i32 0x3))>;
6503}
6504
6505//===----------------------------------------------------------------------===//
6506// SSE4.1 - Packed Bit Test
6507//===----------------------------------------------------------------------===//
6508
// ptest instruction we'll lower to this in X86ISelLowering primarily from
// the intel intrinsic that corresponds to this.
let Defs = [EFLAGS], Predicates = [HasAVX] in {
// VPTEST sets EFLAGS from X86ptest on the two vector operands; register and
// folded-load forms, 128- and 256-bit.
def VPTESTrr  : SS48I<0x17, MRMSrcReg, (outs), (ins VR128:$src1, VR128:$src2),
                "vptest\t{$src2, $src1|$src1, $src2}",
                [(set EFLAGS, (X86ptest VR128:$src1, (v2i64 VR128:$src2)))]>,
                OpSize, VEX;
def VPTESTrm  : SS48I<0x17, MRMSrcMem, (outs), (ins VR128:$src1, f128mem:$src2),
                "vptest\t{$src2, $src1|$src1, $src2}",
                [(set EFLAGS,(X86ptest VR128:$src1, (loadv2i64 addr:$src2)))]>,
                OpSize, VEX;

def VPTESTYrr : SS48I<0x17, MRMSrcReg, (outs), (ins VR256:$src1, VR256:$src2),
                "vptest\t{$src2, $src1|$src1, $src2}",
                [(set EFLAGS, (X86ptest VR256:$src1, (v4i64 VR256:$src2)))]>,
                OpSize, VEX, VEX_L;
def VPTESTYrm : SS48I<0x17, MRMSrcMem, (outs), (ins VR256:$src1, i256mem:$src2),
                "vptest\t{$src2, $src1|$src1, $src2}",
                [(set EFLAGS,(X86ptest VR256:$src1, (loadv4i64 addr:$src2)))]>,
                OpSize, VEX, VEX_L;
}

let Defs = [EFLAGS] in {
// Legacy PTEST: the memory form uses the alignment-checked memop fragment.
def PTESTrr : SS48I<0x17, MRMSrcReg, (outs), (ins VR128:$src1, VR128:$src2),
              "ptest\t{$src2, $src1|$src1, $src2}",
              [(set EFLAGS, (X86ptest VR128:$src1, (v2i64 VR128:$src2)))]>,
              OpSize;
def PTESTrm : SS48I<0x17, MRMSrcMem, (outs), (ins VR128:$src1, f128mem:$src2),
              "ptest\t{$src2, $src1|$src1, $src2}",
              [(set EFLAGS, (X86ptest VR128:$src1, (memopv2i64 addr:$src2)))]>,
              OpSize;
}
6541
// The bit test instructions below are AVX only
// avx_bittest - VTESTPS/VTESTPD: sets EFLAGS from X86testp on the sign bits
// of the packed FP operands; register and folded-load forms.
multiclass avx_bittest<bits<8> opc, string OpcodeStr, RegisterClass RC,
                       X86MemOperand x86memop, PatFrag mem_frag, ValueType vt> {
  def rr : SS48I<opc, MRMSrcReg, (outs), (ins RC:$src1, RC:$src2),
            !strconcat(OpcodeStr, "\t{$src2, $src1|$src1, $src2}"),
            [(set EFLAGS, (X86testp RC:$src1, (vt RC:$src2)))]>, OpSize, VEX;
  def rm : SS48I<opc, MRMSrcMem, (outs), (ins RC:$src1, x86memop:$src2),
            !strconcat(OpcodeStr, "\t{$src2, $src1|$src1, $src2}"),
            [(set EFLAGS, (X86testp RC:$src1, (mem_frag addr:$src2)))]>,
            OpSize, VEX;
}

let Defs = [EFLAGS], Predicates = [HasAVX] in {
let ExeDomain = SSEPackedSingle in {
defm VTESTPS  : avx_bittest<0x0E, "vtestps", VR128, f128mem, loadv4f32, v4f32>;
defm VTESTPSY : avx_bittest<0x0E, "vtestps", VR256, f256mem, loadv8f32, v8f32>,
                            VEX_L;
}
let ExeDomain = SSEPackedDouble in {
defm VTESTPD  : avx_bittest<0x0F, "vtestpd", VR128, f128mem, loadv2f64, v2f64>;
defm VTESTPDY : avx_bittest<0x0F, "vtestpd", VR256, f256mem, loadv4f64, v4f64>,
                            VEX_L;
}
}
6566
6567//===----------------------------------------------------------------------===//
6568// SSE4.1 - Misc Instructions
6569//===----------------------------------------------------------------------===//
6570
// POPCNT (population count): 16/32/64-bit register and load forms.  All
// variants clobber EFLAGS, hence the implicit def in each pattern.
let Defs = [EFLAGS], Predicates = [HasPOPCNT] in {
  def POPCNT16rr : I<0xB8, MRMSrcReg, (outs GR16:$dst), (ins GR16:$src),
                     "popcnt{w}\t{$src, $dst|$dst, $src}",
                     [(set GR16:$dst, (ctpop GR16:$src)), (implicit EFLAGS)],
                     IIC_SSE_POPCNT_RR>,
                     OpSize, XS;
  def POPCNT16rm : I<0xB8, MRMSrcMem, (outs GR16:$dst), (ins i16mem:$src),
                     "popcnt{w}\t{$src, $dst|$dst, $src}",
                     [(set GR16:$dst, (ctpop (loadi16 addr:$src))),
                      (implicit EFLAGS)], IIC_SSE_POPCNT_RM>, OpSize, XS;

  def POPCNT32rr : I<0xB8, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src),
                     "popcnt{l}\t{$src, $dst|$dst, $src}",
                     [(set GR32:$dst, (ctpop GR32:$src)), (implicit EFLAGS)],
                     IIC_SSE_POPCNT_RR>,
                     XS;
  def POPCNT32rm : I<0xB8, MRMSrcMem, (outs GR32:$dst), (ins i32mem:$src),
                     "popcnt{l}\t{$src, $dst|$dst, $src}",
                     [(set GR32:$dst, (ctpop (loadi32 addr:$src))),
                      (implicit EFLAGS)], IIC_SSE_POPCNT_RM>, XS;

  def POPCNT64rr : RI<0xB8, MRMSrcReg, (outs GR64:$dst), (ins GR64:$src),
                      "popcnt{q}\t{$src, $dst|$dst, $src}",
                      [(set GR64:$dst, (ctpop GR64:$src)), (implicit EFLAGS)],
                      IIC_SSE_POPCNT_RR>,
                      XS;
  def POPCNT64rm : RI<0xB8, MRMSrcMem, (outs GR64:$dst), (ins i64mem:$src),
                      "popcnt{q}\t{$src, $dst|$dst, $src}",
                      [(set GR64:$dst, (ctpop (loadi64 addr:$src))),
                       (implicit EFLAGS)], IIC_SSE_POPCNT_RM>, XS;
}
6602
6603
6604
// SS41I_unop_rm_int_v16 - SSE 4.1 unary operator whose type is v8i16.
// Used only for PHMINPOSUW; register and memory-source forms.
multiclass SS41I_unop_rm_int_v16<bits<8> opc, string OpcodeStr,
                                 Intrinsic IntId128> {
  def rr128 : SS48I<opc, MRMSrcReg, (outs VR128:$dst),
                    (ins VR128:$src),
                    !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                    [(set VR128:$dst, (IntId128 VR128:$src))]>, OpSize;
  def rm128 : SS48I<opc, MRMSrcMem, (outs VR128:$dst),
                     (ins i128mem:$src),
                     !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                     [(set VR128:$dst,
                       (IntId128
                        (bitconvert (memopv2i64 addr:$src))))]>, OpSize;
}

let Predicates = [HasAVX] in
defm VPHMINPOSUW : SS41I_unop_rm_int_v16 <0x41, "vphminposuw",
                                         int_x86_sse41_phminposuw>, VEX;
defm PHMINPOSUW : SS41I_unop_rm_int_v16 <0x41, "phminposuw",
                                         int_x86_sse41_phminposuw>;
6625
/// SS41I_binop_rm_int - Simple SSE 4.1 binary operator
/// Register/register and register/memory forms matching an intrinsic.
/// NOTE(review): the memory form uses the alignment-checked memopv2i64 even
/// when instantiated for VEX encodings below — confirm whether the unaligned
/// loadv2i64 fragment was intended for the AVX instantiations.
multiclass SS41I_binop_rm_int<bits<8> opc, string OpcodeStr,
                              Intrinsic IntId128, bit Is2Addr = 1,
                              OpndItins itins = DEFAULT_ITINS> {
  let isCommutable = 1 in
  def rr : SS48I<opc, MRMSrcReg, (outs VR128:$dst),
       (ins VR128:$src1, VR128:$src2),
       !if(Is2Addr,
           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
       [(set VR128:$dst, (IntId128 VR128:$src1, VR128:$src2))],
        itins.rr>, OpSize;
  def rm : SS48I<opc, MRMSrcMem, (outs VR128:$dst),
       (ins VR128:$src1, i128mem:$src2),
       !if(Is2Addr,
           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
       [(set VR128:$dst,
         (IntId128 VR128:$src1,
          (bitconvert (memopv2i64 addr:$src2))))],
          itins.rm>, OpSize;
}

/// SS41I_binop_rm_int_y - Simple SSE 4.1 binary operator
/// 256-bit (AVX2) counterpart of SS41I_binop_rm_int; always VEX 3-operand.
multiclass SS41I_binop_rm_int_y<bits<8> opc, string OpcodeStr,
                                Intrinsic IntId256> {
  let isCommutable = 1 in
  def Yrr : SS48I<opc, MRMSrcReg, (outs VR256:$dst),
       (ins VR256:$src1, VR256:$src2),
       !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
       [(set VR256:$dst, (IntId256 VR256:$src1, VR256:$src2))]>, OpSize;
  def Yrm : SS48I<opc, MRMSrcMem, (outs VR256:$dst),
       (ins VR256:$src1, i256mem:$src2),
       !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
       [(set VR256:$dst,
         (IntId256 VR256:$src1,
          (bitconvert (loadv4i64 addr:$src2))))]>, OpSize;
}
6664
6665
/// SS48I_binop_rm - Simple SSE41 binary operator.
/// Generic version parameterized over the SDNode, vector type, register
/// class, and load fragment; used for the min/max families below.
multiclass SS48I_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
                          ValueType OpVT, RegisterClass RC, PatFrag memop_frag,
                          X86MemOperand x86memop, bit Is2Addr = 1,
                          OpndItins itins = DEFAULT_ITINS> {
  let isCommutable = 1 in
  def rr : SS48I<opc, MRMSrcReg, (outs RC:$dst),
       (ins RC:$src1, RC:$src2),
       !if(Is2Addr,
           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
       [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2)))]>, OpSize;
  def rm : SS48I<opc, MRMSrcMem, (outs RC:$dst),
       (ins RC:$src1, x86memop:$src2),
       !if(Is2Addr,
           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
       [(set RC:$dst,
         (OpVT (OpNode RC:$src1,
          (bitconvert (memop_frag addr:$src2)))))]>, OpSize;
}
6687
// 128-bit VEX instantiations: PACKUSDW/PMULDQ via the intrinsic multiclass,
// min/max via the generic SDNode multiclass with unaligned loads.
let Predicates = [HasAVX] in {
  let isCommutable = 0 in
  defm VPACKUSDW : SS41I_binop_rm_int<0x2B, "vpackusdw", int_x86_sse41_packusdw,
                                                         0>, VEX_4V;
  defm VPMINSB   : SS48I_binop_rm<0x38, "vpminsb", X86smin, v16i8, VR128,
                                  loadv2i64, i128mem, 0>, VEX_4V;
  defm VPMINSD   : SS48I_binop_rm<0x39, "vpminsd", X86smin, v4i32, VR128,
                                  loadv2i64, i128mem, 0>, VEX_4V;
  defm VPMINUD   : SS48I_binop_rm<0x3B, "vpminud", X86umin, v4i32, VR128,
                                  loadv2i64, i128mem, 0>, VEX_4V;
  defm VPMINUW   : SS48I_binop_rm<0x3A, "vpminuw", X86umin, v8i16, VR128,
                                  loadv2i64, i128mem, 0>, VEX_4V;
  defm VPMAXSB   : SS48I_binop_rm<0x3C, "vpmaxsb", X86smax, v16i8, VR128,
                                  loadv2i64, i128mem, 0>, VEX_4V;
  defm VPMAXSD   : SS48I_binop_rm<0x3D, "vpmaxsd", X86smax, v4i32, VR128,
                                  loadv2i64, i128mem, 0>, VEX_4V;
  defm VPMAXUD   : SS48I_binop_rm<0x3F, "vpmaxud", X86umax, v4i32, VR128,
                                  loadv2i64, i128mem, 0>, VEX_4V;
  defm VPMAXUW   : SS48I_binop_rm<0x3E, "vpmaxuw", X86umax, v8i16, VR128,
                                  loadv2i64, i128mem, 0>, VEX_4V;
  defm VPMULDQ   : SS41I_binop_rm_int<0x28, "vpmuldq",   int_x86_sse41_pmuldq,
                                                         0>, VEX_4V;
}

// 256-bit AVX2 instantiations of the same operations.
let Predicates = [HasAVX2] in {
  let isCommutable = 0 in
  defm VPACKUSDW : SS41I_binop_rm_int_y<0x2B, "vpackusdw",
                                        int_x86_avx2_packusdw>, VEX_4V, VEX_L;
  defm VPMINSBY  : SS48I_binop_rm<0x38, "vpminsb", X86smin, v32i8, VR256,
                                  loadv4i64, i256mem, 0>, VEX_4V, VEX_L;
  defm VPMINSDY  : SS48I_binop_rm<0x39, "vpminsd", X86smin, v8i32, VR256,
                                  loadv4i64, i256mem, 0>, VEX_4V, VEX_L;
  defm VPMINUDY  : SS48I_binop_rm<0x3B, "vpminud", X86umin, v8i32, VR256,
                                  loadv4i64, i256mem, 0>, VEX_4V, VEX_L;
  defm VPMINUWY  : SS48I_binop_rm<0x3A, "vpminuw", X86umin, v16i16, VR256,
                                  loadv4i64, i256mem, 0>, VEX_4V, VEX_L;
  defm VPMAXSBY  : SS48I_binop_rm<0x3C, "vpmaxsb", X86smax, v32i8, VR256,
                                  loadv4i64, i256mem, 0>, VEX_4V, VEX_L;
  defm VPMAXSDY  : SS48I_binop_rm<0x3D, "vpmaxsd", X86smax, v8i32, VR256,
                                  loadv4i64, i256mem, 0>, VEX_4V, VEX_L;
  defm VPMAXUDY  : SS48I_binop_rm<0x3F, "vpmaxud", X86umax, v8i32, VR256,
                                  loadv4i64, i256mem, 0>, VEX_4V, VEX_L;
  defm VPMAXUWY  : SS48I_binop_rm<0x3E, "vpmaxuw", X86umax, v16i16, VR256,
                                  loadv4i64, i256mem, 0>, VEX_4V, VEX_L;
  defm VPMULDQ   : SS41I_binop_rm_int_y<0x28, "vpmuldq",
                                        int_x86_avx2_pmul_dq>, VEX_4V, VEX_L;
}
6735
6736let Constraints = "$src1 = $dst" in {
6737  let isCommutable = 0 in
6738  defm PACKUSDW : SS41I_binop_rm_int<0x2B, "packusdw", int_x86_sse41_packusdw>;
6739  defm PMINSB   : SS48I_binop_rm<0x38, "pminsb", X86smin, v16i8, VR128,
6740                                 memopv2i64, i128mem, 1, SSE_INTALU_ITINS_P>;
6741  defm PMINSD   : SS48I_binop_rm<0x39, "pminsd", X86smin, v4i32, VR128,
6742                                 memopv2i64, i128mem, 1, SSE_INTALU_ITINS_P>;
6743  defm PMINUD   : SS48I_binop_rm<0x3B, "pminud", X86umin, v4i32, VR128,
6744                                 memopv2i64, i128mem, 1, SSE_INTALU_ITINS_P>;
6745  defm PMINUW   : SS48I_binop_rm<0x3A, "pminuw", X86umin, v8i16, VR128,
6746                                 memopv2i64, i128mem, 1, SSE_INTALU_ITINS_P>;
6747  defm PMAXSB   : SS48I_binop_rm<0x3C, "pmaxsb", X86smax, v16i8, VR128,
6748                                 memopv2i64, i128mem, 1, SSE_INTALU_ITINS_P>;
6749  defm PMAXSD   : SS48I_binop_rm<0x3D, "pmaxsd", X86smax, v4i32, VR128,
6750                                 memopv2i64, i128mem, 1, SSE_INTALU_ITINS_P>;
6751  defm PMAXUD   : SS48I_binop_rm<0x3F, "pmaxud", X86umax, v4i32, VR128,
6752                                 memopv2i64, i128mem, 1, SSE_INTALU_ITINS_P>;
6753  defm PMAXUW   : SS48I_binop_rm<0x3E, "pmaxuw", X86umax, v8i16, VR128,
6754                                 memopv2i64, i128mem, 1, SSE_INTALU_ITINS_P>;
6755  defm PMULDQ   : SS41I_binop_rm_int<0x28, "pmuldq",   int_x86_sse41_pmuldq,
6756                                     1, SSE_INTMUL_ITINS_P>;
6757}
6758
// PMULLD / PCMPEQQ (opcodes 0x40 / 0x29), AVX and AVX2 forms.
// FIX: use the unaligned loadv2i64/loadv4i64 fragments instead of the
// alignment-restricted memopv2i64/memopv4i64 ones.  VEX-encoded instructions
// do not require aligned memory operands, and every other HasAVX/HasAVX2 def
// in this section already uses loadv*; with memopv* an unaligned load could
// not be folded into the instruction (missed folding, not miscompilation).
let Predicates = [HasAVX] in {
  defm VPMULLD  : SS48I_binop_rm<0x40, "vpmulld", mul, v4i32, VR128,
                                loadv2i64, i128mem, 0>, VEX_4V;
  defm VPCMPEQQ : SS48I_binop_rm<0x29, "vpcmpeqq", X86pcmpeq, v2i64, VR128,
                                 loadv2i64, i128mem, 0>, VEX_4V;
}
let Predicates = [HasAVX2] in {
  defm VPMULLDY  : SS48I_binop_rm<0x40, "vpmulld", mul, v8i32, VR256,
                                  loadv4i64, i256mem, 0>, VEX_4V, VEX_L;
  defm VPCMPEQQY : SS48I_binop_rm<0x29, "vpcmpeqq", X86pcmpeq, v4i64, VR256,
                                  loadv4i64, i256mem, 0>, VEX_4V, VEX_L;
}
6771
// Legacy SSE4.1 PMULLD / PCMPEQQ: two-address form, aligned memory
// fragments, with their dedicated itineraries (PMULLD is notably slower
// than the ALU ops, hence SSE_PMULLD_ITINS).
let Constraints = "$src1 = $dst" in {
  defm PMULLD  : SS48I_binop_rm<0x40, "pmulld", mul, v4i32, VR128,
                                memopv2i64, i128mem, 1, SSE_PMULLD_ITINS>;
  defm PCMPEQQ : SS48I_binop_rm<0x29, "pcmpeqq", X86pcmpeq, v2i64, VR128,
                                memopv2i64, i128mem, 1, SSE_INTALUQ_ITINS_P>;
}
6778
/// SS41I_binop_rmi_int - SSE 4.1 binary operator with 8-bit immediate
///
/// Emits a reg-reg ("rri") and a reg-mem ("rmi") form that lower the given
/// intrinsic.  Is2Addr selects between the two-address SSE asm string
/// (dst == src1 implicit) and the three-operand AVX asm string.  The
/// immediate is parsed as u32u8imm but matched as a plain imm in the
/// pattern.  Note isCommutable is set on the register form here and is
/// overridden to 0 by instantiations where operand order matters (blends).
multiclass SS41I_binop_rmi_int<bits<8> opc, string OpcodeStr,
                 Intrinsic IntId, RegisterClass RC, PatFrag memop_frag,
                 X86MemOperand x86memop, bit Is2Addr = 1,
                 OpndItins itins = DEFAULT_ITINS> {
  let isCommutable = 1 in
  def rri : SS4AIi8<opc, MRMSrcReg, (outs RC:$dst),
        (ins RC:$src1, RC:$src2, u32u8imm:$src3),
        !if(Is2Addr,
            !strconcat(OpcodeStr,
                "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
            !strconcat(OpcodeStr,
                "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
        [(set RC:$dst, (IntId RC:$src1, RC:$src2, imm:$src3))], itins.rr>,
        OpSize;
  // Memory form: the loaded value is bitconverted to the intrinsic's
  // expected element type by the instantiation's memop_frag.
  def rmi : SS4AIi8<opc, MRMSrcMem, (outs RC:$dst),
        (ins RC:$src1, x86memop:$src2, u32u8imm:$src3),
        !if(Is2Addr,
            !strconcat(OpcodeStr,
                "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
            !strconcat(OpcodeStr,
                "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
        [(set RC:$dst,
          (IntId RC:$src1,
           (bitconvert (memop_frag addr:$src2)), imm:$src3))], itins.rm>,
        OpSize;
}
6806
// AVX immediate blends (BLENDPS/PD, PBLENDW), MPSADBW, and dot products
// (DPPS/DPPD).  Blends and MPSADBW are not commutable (the immediate's bit
// meaning is tied to operand order), so isCommutable is forced to 0.
let Predicates = [HasAVX] in {
  let isCommutable = 0 in {
    let ExeDomain = SSEPackedSingle in {
    defm VBLENDPS : SS41I_binop_rmi_int<0x0C, "vblendps", int_x86_sse41_blendps,
                                        VR128, loadv4f32, f128mem, 0>, VEX_4V;
    defm VBLENDPSY : SS41I_binop_rmi_int<0x0C, "vblendps",
                                    int_x86_avx_blend_ps_256, VR256, loadv8f32,
                                    f256mem, 0>, VEX_4V, VEX_L;
    }
    let ExeDomain = SSEPackedDouble in {
    defm VBLENDPD : SS41I_binop_rmi_int<0x0D, "vblendpd", int_x86_sse41_blendpd,
                                        VR128, loadv2f64, f128mem, 0>, VEX_4V;
    defm VBLENDPDY : SS41I_binop_rmi_int<0x0D, "vblendpd",
                                     int_x86_avx_blend_pd_256,VR256, loadv4f64,
                                     f256mem, 0>, VEX_4V, VEX_L;
    }
  defm VPBLENDW : SS41I_binop_rmi_int<0x0E, "vpblendw", int_x86_sse41_pblendw,
                                      VR128, loadv2i64, i128mem, 0>, VEX_4V;
  defm VMPSADBW : SS41I_binop_rmi_int<0x42, "vmpsadbw", int_x86_sse41_mpsadbw,
                                      VR128, loadv2i64, i128mem, 0>, VEX_4V;
  }
  let ExeDomain = SSEPackedSingle in
  defm VDPPS : SS41I_binop_rmi_int<0x40, "vdpps", int_x86_sse41_dpps,
                                   VR128, loadv4f32, f128mem, 0>, VEX_4V;
  let ExeDomain = SSEPackedDouble in
  defm VDPPD : SS41I_binop_rmi_int<0x41, "vdppd", int_x86_sse41_dppd,
                                   VR128, loadv2f64, f128mem, 0>, VEX_4V;
  // FIX: VDPPSY is a packed-single op (SSEPackedSingle domain, loadv8f32
  // fragment) and previously used the integer operand class i256mem, unlike
  // its siblings VBLENDPSY/VDPPS which use f*mem.  Use f256mem for
  // consistency; the memory-operand class only affects asm operand
  // printing/parsing, not the encoding.
  let ExeDomain = SSEPackedSingle in
  defm VDPPSY : SS41I_binop_rmi_int<0x40, "vdpps", int_x86_avx_dp_ps_256,
                                  VR256, loadv8f32, f256mem, 0>, VEX_4V, VEX_L;
}
6838
// AVX2 256-bit immediate blend / MPSADBW forms.
let Predicates = [HasAVX2] in {
  let isCommutable = 0 in {
  defm VPBLENDWY : SS41I_binop_rmi_int<0x0E, "vpblendw", int_x86_avx2_pblendw,
                                  VR256, loadv4i64, i256mem, 0>, VEX_4V, VEX_L;
  defm VMPSADBWY : SS41I_binop_rmi_int<0x42, "vmpsadbw", int_x86_avx2_mpsadbw,
                                  VR256, loadv4i64, i256mem, 0>, VEX_4V, VEX_L;
  }
}

// Legacy SSE4.1 immediate blends, MPSADBW and dot products: two-address
// form with aligned memory fragments and explicit itineraries.
let Constraints = "$src1 = $dst" in {
  let isCommutable = 0 in {
  let ExeDomain = SSEPackedSingle in
  defm BLENDPS : SS41I_binop_rmi_int<0x0C, "blendps", int_x86_sse41_blendps,
                                     VR128, memopv4f32, f128mem,
                                     1, SSE_INTALU_ITINS_P>;
  let ExeDomain = SSEPackedDouble in
  defm BLENDPD : SS41I_binop_rmi_int<0x0D, "blendpd", int_x86_sse41_blendpd,
                                     VR128, memopv2f64, f128mem,
                                     1, SSE_INTALU_ITINS_P>;
  defm PBLENDW : SS41I_binop_rmi_int<0x0E, "pblendw", int_x86_sse41_pblendw,
                                     VR128, memopv2i64, i128mem,
                                     1, SSE_INTALU_ITINS_P>;
  defm MPSADBW : SS41I_binop_rmi_int<0x42, "mpsadbw", int_x86_sse41_mpsadbw,
                                     VR128, memopv2i64, i128mem,
                                     1, SSE_INTMUL_ITINS_P>;
  }
  let ExeDomain = SSEPackedSingle in
  defm DPPS : SS41I_binop_rmi_int<0x40, "dpps", int_x86_sse41_dpps,
                                  VR128, memopv4f32, f128mem, 1,
                                  SSE_DPPS_ITINS>;
  let ExeDomain = SSEPackedDouble in
  defm DPPD : SS41I_binop_rmi_int<0x41, "dppd", int_x86_sse41_dppd,
                                  VR128, memopv2f64, f128mem, 1,
                                  SSE_DPPD_ITINS>;
}
6874
/// SS41I_quaternary_int_avx - AVX SSE 4.1 with 4 operators
///
/// AVX variable-blend form: dst, src1, src2 plus a third REGISTER source
/// ($src3, the blend mask).  VEX_I8IMM indicates that this third register
/// is encoded in the immediate byte rather than in ModRM.  Both forms lower
/// the given intrinsic; the memory form folds a load of src2 through
/// mem_frag with a bitconvert to the intrinsic's element type.
multiclass SS41I_quaternary_int_avx<bits<8> opc, string OpcodeStr,
                                    RegisterClass RC, X86MemOperand x86memop,
                                    PatFrag mem_frag, Intrinsic IntId> {
  def rr : Ii8<opc, MRMSrcReg, (outs RC:$dst),
                  (ins RC:$src1, RC:$src2, RC:$src3),
                  !strconcat(OpcodeStr,
                    "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
                  [(set RC:$dst, (IntId RC:$src1, RC:$src2, RC:$src3))],
                  NoItinerary, SSEPackedInt>, OpSize, TA, VEX_4V, VEX_I8IMM;

  def rm : Ii8<opc, MRMSrcMem, (outs RC:$dst),
                  (ins RC:$src1, x86memop:$src2, RC:$src3),
                  !strconcat(OpcodeStr,
                    "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
                  [(set RC:$dst,
                        (IntId RC:$src1, (bitconvert (mem_frag addr:$src2)),
                               RC:$src3))],
                  NoItinerary, SSEPackedInt>, OpSize, TA, VEX_4V, VEX_I8IMM;
}
6895
// AVX variable blends (mask supplied as a fourth register operand rather
// than the implicit XMM0 of the legacy SSE4.1 encodings).
let Predicates = [HasAVX] in {
let ExeDomain = SSEPackedDouble in {
defm VBLENDVPD  : SS41I_quaternary_int_avx<0x4B, "vblendvpd", VR128, f128mem,
                                           loadv2f64, int_x86_sse41_blendvpd>;
defm VBLENDVPDY : SS41I_quaternary_int_avx<0x4B, "vblendvpd", VR256, f256mem,
                                  loadv4f64, int_x86_avx_blendv_pd_256>, VEX_L;
} // ExeDomain = SSEPackedDouble
let ExeDomain = SSEPackedSingle in {
defm VBLENDVPS  : SS41I_quaternary_int_avx<0x4A, "vblendvps", VR128, f128mem,
                                           loadv4f32, int_x86_sse41_blendvps>;
defm VBLENDVPSY : SS41I_quaternary_int_avx<0x4A, "vblendvps", VR256, f256mem,
                                  loadv8f32, int_x86_avx_blendv_ps_256>, VEX_L;
} // ExeDomain = SSEPackedSingle
defm VPBLENDVB  : SS41I_quaternary_int_avx<0x4C, "vpblendvb", VR128, i128mem,
                                           loadv2i64, int_x86_sse41_pblendvb>;
}

// 256-bit byte blend requires AVX2 (the other 256-bit blendvs above are FP
// and thus available with plain AVX).
let Predicates = [HasAVX2] in {
defm VPBLENDVBY : SS41I_quaternary_int_avx<0x4C, "vpblendvb", VR256, i256mem,
                                      loadv4i64, int_x86_avx2_pblendvb>, VEX_L;
}
6917
// Selection patterns mapping generic vselect / X86Blendi nodes onto the AVX
// blend instructions.  Note the deliberate operand swap in the vselect
// patterns: vselect picks its SECOND operand ($src1 here) where the mask is
// true, while BLENDV picks the instruction's second source where the mask
// bit is set, so vselect's true value must land in the instruction's second
// source slot.  The X86Blendi (immediate blend) patterns keep operand order.
let Predicates = [HasAVX] in {
  def : Pat<(v16i8 (vselect (v16i8 VR128:$mask), (v16i8 VR128:$src1),
                            (v16i8 VR128:$src2))),
            (VPBLENDVBrr VR128:$src2, VR128:$src1, VR128:$mask)>;
  def : Pat<(v4i32 (vselect (v4i32 VR128:$mask), (v4i32 VR128:$src1),
                            (v4i32 VR128:$src2))),
            (VBLENDVPSrr VR128:$src2, VR128:$src1, VR128:$mask)>;
  def : Pat<(v4f32 (vselect (v4i32 VR128:$mask), (v4f32 VR128:$src1),
                            (v4f32 VR128:$src2))),
            (VBLENDVPSrr VR128:$src2, VR128:$src1, VR128:$mask)>;
  def : Pat<(v2i64 (vselect (v2i64 VR128:$mask), (v2i64 VR128:$src1),
                            (v2i64 VR128:$src2))),
            (VBLENDVPDrr VR128:$src2, VR128:$src1, VR128:$mask)>;
  def : Pat<(v2f64 (vselect (v2i64 VR128:$mask), (v2f64 VR128:$src1),
                            (v2f64 VR128:$src2))),
            (VBLENDVPDrr VR128:$src2, VR128:$src1, VR128:$mask)>;
  def : Pat<(v8i32 (vselect (v8i32 VR256:$mask), (v8i32 VR256:$src1),
                            (v8i32 VR256:$src2))),
            (VBLENDVPSYrr VR256:$src2, VR256:$src1, VR256:$mask)>;
  def : Pat<(v8f32 (vselect (v8i32 VR256:$mask), (v8f32 VR256:$src1),
                            (v8f32 VR256:$src2))),
            (VBLENDVPSYrr VR256:$src2, VR256:$src1, VR256:$mask)>;
  def : Pat<(v4i64 (vselect (v4i64 VR256:$mask), (v4i64 VR256:$src1),
                            (v4i64 VR256:$src2))),
            (VBLENDVPDYrr VR256:$src2, VR256:$src1, VR256:$mask)>;
  def : Pat<(v4f64 (vselect (v4i64 VR256:$mask), (v4f64 VR256:$src1),
                            (v4f64 VR256:$src2))),
            (VBLENDVPDYrr VR256:$src2, VR256:$src1, VR256:$mask)>;

  // Immediate blends: mask comes from the X86Blendi constant, no swap.
  def : Pat<(v8f32 (X86Blendi (v8f32 VR256:$src1), (v8f32 VR256:$src2),
                               (imm:$mask))),
            (VBLENDPSYrri VR256:$src1, VR256:$src2, imm:$mask)>;
  def : Pat<(v4f64 (X86Blendi (v4f64 VR256:$src1), (v4f64 VR256:$src2),
                               (imm:$mask))),
            (VBLENDPDYrri VR256:$src1, VR256:$src2, imm:$mask)>;

  def : Pat<(v8i16 (X86Blendi (v8i16 VR128:$src1), (v8i16 VR128:$src2),
                               (imm:$mask))),
            (VPBLENDWrri VR128:$src1, VR128:$src2, imm:$mask)>;
  def : Pat<(v4f32 (X86Blendi (v4f32 VR128:$src1), (v4f32 VR128:$src2),
                               (imm:$mask))),
            (VBLENDPSrri VR128:$src1, VR128:$src2, imm:$mask)>;
  def : Pat<(v2f64 (X86Blendi (v2f64 VR128:$src1), (v2f64 VR128:$src2),
                               (imm:$mask))),
            (VBLENDPDrri VR128:$src1, VR128:$src2, imm:$mask)>;
}

// AVX2-only: 256-bit byte vselect and 16x16 immediate word blend.
let Predicates = [HasAVX2] in {
  def : Pat<(v32i8 (vselect (v32i8 VR256:$mask), (v32i8 VR256:$src1),
                            (v32i8 VR256:$src2))),
            (VPBLENDVBYrr VR256:$src2, VR256:$src1, VR256:$mask)>;
  def : Pat<(v16i16 (X86Blendi (v16i16 VR256:$src1), (v16i16 VR256:$src2),
                               (imm:$mask))),
            (VPBLENDWYrri VR256:$src1, VR256:$src2, imm:$mask)>;
}
6973
/// SS41I_ternary_int - SSE 4.1 ternary operator
///
/// Legacy (non-VEX) variable blends: the mask is the implicit physical
/// register XMM0 (hence Uses = [XMM0]) and never appears in the asm string.
/// The "0" suffix on rr0/rm0 marks the implicit-XMM0 encoding.  Two-address:
/// $src1 is tied to $dst.
let Uses = [XMM0], Constraints = "$src1 = $dst" in {
  multiclass SS41I_ternary_int<bits<8> opc, string OpcodeStr, PatFrag mem_frag,
                               X86MemOperand x86memop, Intrinsic IntId,
                               OpndItins itins = DEFAULT_ITINS> {
    def rr0 : SS48I<opc, MRMSrcReg, (outs VR128:$dst),
                    (ins VR128:$src1, VR128:$src2),
                    !strconcat(OpcodeStr,
                     "\t{$src2, $dst|$dst, $src2}"),
                    [(set VR128:$dst, (IntId VR128:$src1, VR128:$src2, XMM0))],
                    itins.rr>, OpSize;

    def rm0 : SS48I<opc, MRMSrcMem, (outs VR128:$dst),
                    (ins VR128:$src1, x86memop:$src2),
                    !strconcat(OpcodeStr,
                     "\t{$src2, $dst|$dst, $src2}"),
                    [(set VR128:$dst,
                      (IntId VR128:$src1,
                       (bitconvert (mem_frag addr:$src2)), XMM0))],
                       itins.rm>, OpSize;
  }
}
6996
// Legacy SSE4.1 variable blends (implicit XMM0 mask).
let ExeDomain = SSEPackedDouble in
defm BLENDVPD : SS41I_ternary_int<0x15, "blendvpd", memopv2f64, f128mem,
                                  int_x86_sse41_blendvpd>;
let ExeDomain = SSEPackedSingle in
defm BLENDVPS : SS41I_ternary_int<0x14, "blendvps", memopv4f32, f128mem,
                                  int_x86_sse41_blendvps>;
defm PBLENDVB : SS41I_ternary_int<0x10, "pblendvb", memopv2i64, i128mem,
                                  int_x86_sse41_pblendvb>;

// Aliases with the implicit xmm0 argument
// These let the assembler accept the three-operand spelling in which the
// XMM0 mask is written out explicitly, as some assemblers/disassemblers do.
def : InstAlias<"blendvpd\t{%xmm0, $src2, $dst|$dst, $src2, xmm0}",
                (BLENDVPDrr0 VR128:$dst, VR128:$src2)>;
def : InstAlias<"blendvpd\t{%xmm0, $src2, $dst|$dst, $src2, xmm0}",
                (BLENDVPDrm0 VR128:$dst, f128mem:$src2)>;
def : InstAlias<"blendvps\t{%xmm0, $src2, $dst|$dst, $src2, xmm0}",
                (BLENDVPSrr0 VR128:$dst, VR128:$src2)>;
def : InstAlias<"blendvps\t{%xmm0, $src2, $dst|$dst, $src2, xmm0}",
                (BLENDVPSrm0 VR128:$dst, f128mem:$src2)>;
def : InstAlias<"pblendvb\t{%xmm0, $src2, $dst|$dst, $src2, xmm0}",
                (PBLENDVBrr0 VR128:$dst, VR128:$src2)>;
def : InstAlias<"pblendvb\t{%xmm0, $src2, $dst|$dst, $src2, xmm0}",
                (PBLENDVBrm0 VR128:$dst, i128mem:$src2)>;
7019
// SSE4.1 selection patterns (mask pinned in XMM0 by the register
// allocator).  As with the AVX patterns above, the vselect operands are
// swapped into the blendv instruction's slots; X86Blendi keeps its order.
let Predicates = [UseSSE41] in {
  def : Pat<(v16i8 (vselect (v16i8 XMM0), (v16i8 VR128:$src1),
                            (v16i8 VR128:$src2))),
            (PBLENDVBrr0 VR128:$src2, VR128:$src1)>;
  def : Pat<(v4i32 (vselect (v4i32 XMM0), (v4i32 VR128:$src1),
                            (v4i32 VR128:$src2))),
            (BLENDVPSrr0 VR128:$src2, VR128:$src1)>;
  def : Pat<(v4f32 (vselect (v4i32 XMM0), (v4f32 VR128:$src1),
                            (v4f32 VR128:$src2))),
            (BLENDVPSrr0 VR128:$src2, VR128:$src1)>;
  def : Pat<(v2i64 (vselect (v2i64 XMM0), (v2i64 VR128:$src1),
                            (v2i64 VR128:$src2))),
            (BLENDVPDrr0 VR128:$src2, VR128:$src1)>;
  def : Pat<(v2f64 (vselect (v2i64 XMM0), (v2f64 VR128:$src1),
                            (v2f64 VR128:$src2))),
            (BLENDVPDrr0 VR128:$src2, VR128:$src1)>;

  def : Pat<(v8i16 (X86Blendi (v8i16 VR128:$src1), (v8i16 VR128:$src2),
                               (imm:$mask))),
            (PBLENDWrri VR128:$src1, VR128:$src2, imm:$mask)>;
  def : Pat<(v4f32 (X86Blendi (v4f32 VR128:$src1), (v4f32 VR128:$src2),
                               (imm:$mask))),
            (BLENDPSrri VR128:$src1, VR128:$src2, imm:$mask)>;
  def : Pat<(v2f64 (X86Blendi (v2f64 VR128:$src1), (v2f64 VR128:$src2),
                               (imm:$mask))),
            (BLENDPDrri VR128:$src1, VR128:$src2, imm:$mask)>;

}
7048
// MOVNTDQA: non-temporal aligned load, lowered via intrinsic only (no
// generic load pattern).
// NOTE(review): the legacy MOVNTDQArm carries no explicit Predicates here —
// presumably the SS48I base class supplies the SSE4.1 requirement; confirm
// against the class definition earlier in this file.
let Predicates = [HasAVX] in
def VMOVNTDQArm : SS48I<0x2A, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
                       "vmovntdqa\t{$src, $dst|$dst, $src}",
                       [(set VR128:$dst, (int_x86_sse41_movntdqa addr:$src))]>,
                       OpSize, VEX;
let Predicates = [HasAVX2] in
def VMOVNTDQAYrm : SS48I<0x2A, MRMSrcMem, (outs VR256:$dst), (ins i256mem:$src),
                         "vmovntdqa\t{$src, $dst|$dst, $src}",
                         [(set VR256:$dst, (int_x86_avx2_movntdqa addr:$src))]>,
                         OpSize, VEX, VEX_L;
def MOVNTDQArm : SS48I<0x2A, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
                       "movntdqa\t{$src, $dst|$dst, $src}",
                       [(set VR128:$dst, (int_x86_sse41_movntdqa addr:$src))]>,
                       OpSize;
7063
7064//===----------------------------------------------------------------------===//
7065// SSE4.2 - Compare Instructions
7066//===----------------------------------------------------------------------===//
7067
/// SS42I_binop_rm - Simple SSE 4.2 binary operator
///
/// Register-register and register-memory forms selecting on a target DAG
/// node (OpNode).  Is2Addr chooses the two-address SSE asm string versus
/// the three-operand AVX one.  Unlike the SSE4.1 helpers above, the memory
/// pattern applies memop_frag directly with no bitconvert, so callers pass
/// a fragment of the final vector type.
multiclass SS42I_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
                          ValueType OpVT, RegisterClass RC, PatFrag memop_frag,
                          X86MemOperand x86memop, bit Is2Addr = 1> {
  def rr : SS428I<opc, MRMSrcReg, (outs RC:$dst),
       (ins RC:$src1, RC:$src2),
       !if(Is2Addr,
           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
       [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2)))]>,
       OpSize;
  def rm : SS428I<opc, MRMSrcMem, (outs RC:$dst),
       (ins RC:$src1, x86memop:$src2),
       !if(Is2Addr,
           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
       [(set RC:$dst,
         (OpVT (OpNode RC:$src1, (memop_frag addr:$src2))))]>, OpSize;
}
7087
// PCMPGTQ (signed 64-bit greater-than compare, opcode 0x37) in AVX,
// AVX2 and legacy SSE4.2 two-address forms.
let Predicates = [HasAVX] in
  defm VPCMPGTQ : SS42I_binop_rm<0x37, "vpcmpgtq", X86pcmpgt, v2i64, VR128,
                                 loadv2i64, i128mem, 0>, VEX_4V;

let Predicates = [HasAVX2] in
  defm VPCMPGTQY : SS42I_binop_rm<0x37, "vpcmpgtq", X86pcmpgt, v4i64, VR256,
                                  loadv4i64, i256mem, 0>, VEX_4V, VEX_L;

let Constraints = "$src1 = $dst" in
  defm PCMPGTQ : SS42I_binop_rm<0x37, "pcmpgtq", X86pcmpgt, v2i64, VR128,
                                memopv2i64, i128mem>;
7099
7100//===----------------------------------------------------------------------===//
7101// SSE4.2 - String/text Processing Instructions
7102//===----------------------------------------------------------------------===//
7103
7104// Packed Compare Implicit Length Strings, Return Mask
// PCMPISTRM is modeled in two layers: pattern-bearing pseudos that carry
// the intrinsic (expanded by a custom inserter, which handles the implicit
// XMM0 result), and the real encodings below with empty patterns used only
// for assembly/disassembly.
multiclass pseudo_pcmpistrm<string asm> {
  def REG : PseudoI<(outs VR128:$dst),
                    (ins VR128:$src1, VR128:$src2, i8imm:$src3),
    [(set VR128:$dst, (int_x86_sse42_pcmpistrm128 VR128:$src1, VR128:$src2,
                                                  imm:$src3))]>;
  def MEM : PseudoI<(outs VR128:$dst),
                    (ins VR128:$src1, i128mem:$src2, i8imm:$src3),
    [(set VR128:$dst, (int_x86_sse42_pcmpistrm128 VR128:$src1,
                       (bc_v16i8 (memopv2i64 addr:$src2)), imm:$src3))]>;
}

let Defs = [EFLAGS], usesCustomInserter = 1 in {
  defm VPCMPISTRM128 : pseudo_pcmpistrm<"#VPCMPISTRM128">, Requires<[HasAVX]>;
  defm PCMPISTRM128 : pseudo_pcmpistrm<"#PCMPISTRM128">, Requires<[UseSSE42]>;
}

// Real encodings (opcode 0x62); no patterns — selection goes through the
// pseudos above.
multiclass pcmpistrm_SS42AI<string asm> {
  def rr : SS42AI<0x62, MRMSrcReg, (outs),
    (ins VR128:$src1, VR128:$src2, i8imm:$src3),
    !strconcat(asm, "\t{$src3, $src2, $src1|$src1, $src2, $src3}"),
    []>, OpSize;
  let mayLoad = 1 in
  def rm :SS42AI<0x62, MRMSrcMem, (outs),
    (ins VR128:$src1, i128mem:$src2, i8imm:$src3),
    !strconcat(asm, "\t{$src3, $src2, $src1|$src1, $src2, $src3}"),
    []>, OpSize;
}

// Result mask lands in the implicit XMM0; flags are also defined.
let Defs = [XMM0, EFLAGS], neverHasSideEffects = 1 in {
  let Predicates = [HasAVX] in
  defm VPCMPISTRM128 : pcmpistrm_SS42AI<"vpcmpistrm">, VEX;
  defm PCMPISTRM128  : pcmpistrm_SS42AI<"pcmpistrm"> ;
}
7138
7139// Packed Compare Explicit Length Strings, Return Mask
// Explicit-length variant of PCMPISTRM above: the string lengths come in
// implicitly through EAX/EDX (hence Uses = [EAX, EDX] and their appearance
// in the pseudo's intrinsic pattern).  Same two-layer pseudo/real split.
multiclass pseudo_pcmpestrm<string asm> {
  def REG : PseudoI<(outs VR128:$dst),
                    (ins VR128:$src1, VR128:$src3, i8imm:$src5),
    [(set VR128:$dst, (int_x86_sse42_pcmpestrm128
                       VR128:$src1, EAX, VR128:$src3, EDX, imm:$src5))]>;
  def MEM : PseudoI<(outs VR128:$dst),
                    (ins VR128:$src1, i128mem:$src3, i8imm:$src5),
    [(set VR128:$dst, (int_x86_sse42_pcmpestrm128 VR128:$src1, EAX,
                       (bc_v16i8 (memopv2i64 addr:$src3)), EDX, imm:$src5))]>;
}

let Defs = [EFLAGS], Uses = [EAX, EDX], usesCustomInserter = 1 in {
  defm VPCMPESTRM128 : pseudo_pcmpestrm<"#VPCMPESTRM128">, Requires<[HasAVX]>;
  defm PCMPESTRM128 : pseudo_pcmpestrm<"#PCMPESTRM128">, Requires<[UseSSE42]>;
}

// Real encodings (opcode 0x60); patterns intentionally empty.
multiclass SS42AI_pcmpestrm<string asm> {
  def rr : SS42AI<0x60, MRMSrcReg, (outs),
    (ins VR128:$src1, VR128:$src3, i8imm:$src5),
    !strconcat(asm, "\t{$src5, $src3, $src1|$src1, $src3, $src5}"),
    []>, OpSize;
  let mayLoad = 1 in
  def rm : SS42AI<0x60, MRMSrcMem, (outs),
    (ins VR128:$src1, i128mem:$src3, i8imm:$src5),
    !strconcat(asm, "\t{$src5, $src3, $src1|$src1, $src3, $src5}"),
    []>, OpSize;
}

let Defs = [XMM0, EFLAGS], Uses = [EAX, EDX], neverHasSideEffects = 1 in {
  let Predicates = [HasAVX] in
  defm VPCMPESTRM128 : SS42AI_pcmpestrm<"vpcmpestrm">, VEX;
  defm PCMPESTRM128 :  SS42AI_pcmpestrm<"pcmpestrm">;
}
7173
7174// Packed Compare Implicit Length Strings, Return Index
// Index-returning implicit-length string compare.  The pseudo produces the
// index in a virtual GR32 plus EFLAGS via the X86pcmpistri node; the real
// instruction (opcode 0x63) writes the implicit ECX, which the custom
// inserter copies out.
multiclass pseudo_pcmpistri<string asm> {
  def REG : PseudoI<(outs GR32:$dst),
                    (ins VR128:$src1, VR128:$src2, i8imm:$src3),
    [(set GR32:$dst, EFLAGS,
      (X86pcmpistri VR128:$src1, VR128:$src2, imm:$src3))]>;
  def MEM : PseudoI<(outs GR32:$dst),
                    (ins VR128:$src1, i128mem:$src2, i8imm:$src3),
    [(set GR32:$dst, EFLAGS, (X86pcmpistri VR128:$src1,
                              (bc_v16i8 (memopv2i64 addr:$src2)), imm:$src3))]>;
}

let Defs = [EFLAGS], usesCustomInserter = 1 in {
  defm VPCMPISTRI : pseudo_pcmpistri<"#VPCMPISTRI">, Requires<[HasAVX]>;
  defm PCMPISTRI  : pseudo_pcmpistri<"#PCMPISTRI">, Requires<[UseSSE42]>;
}

// Real encodings; patterns intentionally empty.
multiclass SS42AI_pcmpistri<string asm> {
  def rr : SS42AI<0x63, MRMSrcReg, (outs),
    (ins VR128:$src1, VR128:$src2, i8imm:$src3),
    !strconcat(asm, "\t{$src3, $src2, $src1|$src1, $src2, $src3}"),
    []>, OpSize;
  let mayLoad = 1 in
  def rm : SS42AI<0x63, MRMSrcMem, (outs),
    (ins VR128:$src1, i128mem:$src2, i8imm:$src3),
    !strconcat(asm, "\t{$src3, $src2, $src1|$src1, $src2, $src3}"),
    []>, OpSize;
}

let Defs = [ECX, EFLAGS], neverHasSideEffects = 1 in {
  let Predicates = [HasAVX] in
  defm VPCMPISTRI : SS42AI_pcmpistri<"vpcmpistri">, VEX;
  defm PCMPISTRI  : SS42AI_pcmpistri<"pcmpistri">;
}
7208
7209// Packed Compare Explicit Length Strings, Return Index
// Index-returning explicit-length string compare: like PCMPISTRI above, but
// the string lengths come in implicitly through EAX/EDX.  Opcode 0x61.
multiclass pseudo_pcmpestri<string asm> {
  def REG : PseudoI<(outs GR32:$dst),
                    (ins VR128:$src1, VR128:$src3, i8imm:$src5),
    [(set GR32:$dst, EFLAGS,
      (X86pcmpestri VR128:$src1, EAX, VR128:$src3, EDX, imm:$src5))]>;
  def MEM : PseudoI<(outs GR32:$dst),
                    (ins VR128:$src1, i128mem:$src3, i8imm:$src5),
    [(set GR32:$dst, EFLAGS,
      (X86pcmpestri VR128:$src1, EAX, (bc_v16i8 (memopv2i64 addr:$src3)), EDX,
       imm:$src5))]>;
}

let Defs = [EFLAGS], Uses = [EAX, EDX], usesCustomInserter = 1 in {
  defm VPCMPESTRI : pseudo_pcmpestri<"#VPCMPESTRI">, Requires<[HasAVX]>;
  defm PCMPESTRI  : pseudo_pcmpestri<"#PCMPESTRI">, Requires<[UseSSE42]>;
}

// Real encodings; patterns intentionally empty.
multiclass SS42AI_pcmpestri<string asm> {
  def rr : SS42AI<0x61, MRMSrcReg, (outs),
    (ins VR128:$src1, VR128:$src3, i8imm:$src5),
    !strconcat(asm, "\t{$src5, $src3, $src1|$src1, $src3, $src5}"),
    []>, OpSize;
  let mayLoad = 1 in
  def rm : SS42AI<0x61, MRMSrcMem, (outs),
    (ins VR128:$src1, i128mem:$src3, i8imm:$src5),
    !strconcat(asm, "\t{$src5, $src3, $src1|$src1, $src3, $src5}"),
    []>, OpSize;
}

let Defs = [ECX, EFLAGS], Uses = [EAX, EDX], neverHasSideEffects = 1 in {
  let Predicates = [HasAVX] in
  defm VPCMPESTRI : SS42AI_pcmpestri<"vpcmpestri">, VEX;
  defm PCMPESTRI  : SS42AI_pcmpestri<"pcmpestri">;
}
7244
7245//===----------------------------------------------------------------------===//
7246// SSE4.2 - CRC Instructions
7247//===----------------------------------------------------------------------===//
7248
7249// No CRC instructions have AVX equivalents
7250
7251// crc intrinsic instruction
7252// This set of instructions are only rm, the only difference is the size
7253// of r and m.
// Reg-reg CRC32 form: accumulate $src2 into the running CRC in $src1/$dst.
class SS42I_crc32r<bits<8> opc, string asm, RegisterClass RCOut,
                   RegisterClass RCIn, SDPatternOperator Int> :
  SS42FI<opc, MRMSrcReg, (outs RCOut:$dst), (ins RCOut:$src1, RCIn:$src2),
         !strconcat(asm, "\t{$src2, $src1|$src1, $src2}"),
         [(set RCOut:$dst, (Int RCOut:$src1, RCIn:$src2))], IIC_CRC32_REG>;

// Reg-mem CRC32 form: same, with the data operand loaded from memory.
class SS42I_crc32m<bits<8> opc, string asm, RegisterClass RCOut,
                   X86MemOperand x86memop, SDPatternOperator Int> :
  SS42FI<opc, MRMSrcMem, (outs RCOut:$dst), (ins RCOut:$src1, x86memop:$src2),
         !strconcat(asm, "\t{$src2, $src1|$src1, $src2}"),
         [(set RCOut:$dst, (Int RCOut:$src1, (load addr:$src2)))],
         IIC_CRC32_MEM>;

// All CRC32 forms are two-address (the CRC accumulator is read-modify-write).
// Opcode 0xF0 is the 8-bit-data form, 0xF1 the 16/32/64-bit forms
// (disambiguated by operand-size prefix / REX.W).
let Constraints = "$src1 = $dst" in {
  def CRC32r32m8  : SS42I_crc32m<0xF0, "crc32{b}", GR32, i8mem,
                                 int_x86_sse42_crc32_32_8>;
  def CRC32r32r8  : SS42I_crc32r<0xF0, "crc32{b}", GR32, GR8,
                                 int_x86_sse42_crc32_32_8>;
  def CRC32r32m16 : SS42I_crc32m<0xF1, "crc32{w}", GR32, i16mem,
                                 int_x86_sse42_crc32_32_16>, OpSize;
  def CRC32r32r16 : SS42I_crc32r<0xF1, "crc32{w}", GR32, GR16,
                                 int_x86_sse42_crc32_32_16>, OpSize;
  def CRC32r32m32 : SS42I_crc32m<0xF1, "crc32{l}", GR32, i32mem,
                                 int_x86_sse42_crc32_32_32>;
  def CRC32r32r32 : SS42I_crc32r<0xF1, "crc32{l}", GR32, GR32,
                                 int_x86_sse42_crc32_32_32>;
  def CRC32r64m64 : SS42I_crc32m<0xF1, "crc32{q}", GR64, i64mem,
                                 int_x86_sse42_crc32_64_64>, REX_W;
  def CRC32r64r64 : SS42I_crc32r<0xF1, "crc32{q}", GR64, GR64,
                                 int_x86_sse42_crc32_64_64>, REX_W;
  // 64-bit-destination, 8-bit-data forms exist as encodings but have no
  // matching intrinsic, hence null_frag (no pattern) and no side effects.
  let hasSideEffects = 0 in {
    let mayLoad = 1 in
    def CRC32r64m8 : SS42I_crc32m<0xF0, "crc32{b}", GR64, i8mem,
                                   null_frag>, REX_W;
    def CRC32r64r8 : SS42I_crc32r<0xF0, "crc32{b}", GR64, GR8,
                                   null_frag>, REX_W;
  }
}
7292
7293//===----------------------------------------------------------------------===//
7294// SHA-NI Instructions
7295//===----------------------------------------------------------------------===//
7296
// Two-operand SHA-NI instruction (rr + rm forms).  When UsesXMM0 is set,
// the intrinsic additionally takes the implicit XMM0 register (used by
// SHA256RNDS2 below); the !if selects between the two pattern shapes.
// Memory operands are loaded as v2i64 and bitcast to v4i32 for the
// intrinsics.
multiclass SHAI_binop<bits<8> Opc, string OpcodeStr, Intrinsic IntId,
                      bit UsesXMM0 = 0> {
  def rr : I<Opc, MRMSrcReg, (outs VR128:$dst),
             (ins VR128:$src1, VR128:$src2),
             !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
             [!if(UsesXMM0,
                  (set VR128:$dst, (IntId VR128:$src1, VR128:$src2, XMM0)),
                  (set VR128:$dst, (IntId VR128:$src1, VR128:$src2)))]>, T8;

  def rm : I<Opc, MRMSrcMem, (outs VR128:$dst),
             (ins VR128:$src1, i128mem:$src2),
             !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
             [!if(UsesXMM0,
                  (set VR128:$dst, (IntId VR128:$src1,
                    (bc_v4i32 (memopv2i64 addr:$src2)), XMM0)),
                  (set VR128:$dst, (IntId VR128:$src1,
                    (bc_v4i32 (memopv2i64 addr:$src2)))))]>, T8;
}
7315
// SHA-NI instruction definitions.  All are two-address ($src1 = $dst).
// SHA1RNDS4 is the only one taking an immediate, so it is defined directly
// rather than through SHAI_binop.
let Constraints = "$src1 = $dst", Predicates = [HasSHA] in {
  def SHA1RNDS4rri : Ii8<0xCC, MRMSrcReg, (outs VR128:$dst),
                         (ins VR128:$src1, VR128:$src2, i8imm:$src3),
                         "sha1rnds4\t{$src3, $src2, $dst|$dst, $src2, $src3}",
                         [(set VR128:$dst,
                           (int_x86_sha1rnds4 VR128:$src1, VR128:$src2,
                            (i8 imm:$src3)))]>, TA;
  def SHA1RNDS4rmi : Ii8<0xCC, MRMSrcMem, (outs VR128:$dst),
                         (ins VR128:$src1, i128mem:$src2, i8imm:$src3),
                         "sha1rnds4\t{$src3, $src2, $dst|$dst, $src2, $src3}",
                         [(set VR128:$dst,
                           (int_x86_sha1rnds4 VR128:$src1,
                            (bc_v4i32 (memopv2i64 addr:$src2)),
                            (i8 imm:$src3)))]>, TA;

  defm SHA1NEXTE : SHAI_binop<0xC8, "sha1nexte", int_x86_sha1nexte>;
  defm SHA1MSG1  : SHAI_binop<0xC9, "sha1msg1", int_x86_sha1msg1>;
  defm SHA1MSG2  : SHAI_binop<0xCA, "sha1msg2", int_x86_sha1msg2>;

  // SHA256RNDS2 takes an extra implicit operand in XMM0 (UsesXMM0 = 1).
  let Uses=[XMM0] in
  defm SHA256RNDS2 : SHAI_binop<0xCB, "sha256rnds2", int_x86_sha256rnds2, 1>;

  defm SHA256MSG1 : SHAI_binop<0xCC, "sha256msg1", int_x86_sha256msg1>;
  defm SHA256MSG2 : SHAI_binop<0xCD, "sha256msg2", int_x86_sha256msg2>;
}

// Aliases with explicit %xmm0
// Accept the spelling in which the implicit XMM0 operand is written out.
def : InstAlias<"sha256rnds2\t{%xmm0, $src2, $dst|$dst, $src2, xmm0}",
                (SHA256RNDS2rr VR128:$dst, VR128:$src2)>;
def : InstAlias<"sha256rnds2\t{%xmm0, $src2, $dst|$dst, $src2, xmm0}",
                (SHA256RNDS2rm VR128:$dst, i128mem:$src2)>;
7347
7348//===----------------------------------------------------------------------===//
7349// AES-NI Instructions
7350//===----------------------------------------------------------------------===//
7351
// AESI_binop_rm_int - Common multiclass for the two-source AES round
// instructions (reg/reg and reg/mem forms).  Is2Addr selects the
// destructive SSE assembly string ($src1 tied to $dst at the call site);
// the AVX instantiations pass Is2Addr = 0 for the three-operand syntax.
multiclass AESI_binop_rm_int<bits<8> opc, string OpcodeStr,
                              Intrinsic IntId128, bit Is2Addr = 1> {
  def rr : AES8I<opc, MRMSrcReg, (outs VR128:$dst),
       (ins VR128:$src1, VR128:$src2),
       !if(Is2Addr,
           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
       [(set VR128:$dst, (IntId128 VR128:$src1, VR128:$src2))]>,
       OpSize;
  // Memory form folds an aligned 128-bit load (memopv2i64).
  def rm : AES8I<opc, MRMSrcMem, (outs VR128:$dst),
       (ins VR128:$src1, i128mem:$src2),
       !if(Is2Addr,
           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
       [(set VR128:$dst,
         (IntId128 VR128:$src1, (memopv2i64 addr:$src2)))]>, OpSize;
}
7369
// Perform One Round of an AES Encryption/Decryption Flow
// AVX VEX-encoded three-operand (non-destructive) forms.
let Predicates = [HasAVX, HasAES] in {
  defm VAESENC          : AESI_binop_rm_int<0xDC, "vaesenc",
                         int_x86_aesni_aesenc, 0>, VEX_4V;
  defm VAESENCLAST      : AESI_binop_rm_int<0xDD, "vaesenclast",
                         int_x86_aesni_aesenclast, 0>, VEX_4V;
  defm VAESDEC          : AESI_binop_rm_int<0xDE, "vaesdec",
                         int_x86_aesni_aesdec, 0>, VEX_4V;
  defm VAESDECLAST      : AESI_binop_rm_int<0xDF, "vaesdeclast",
                         int_x86_aesni_aesdeclast, 0>, VEX_4V;
}

// Legacy SSE two-operand forms (default Is2Addr = 1); $src1 tied to $dst.
let Constraints = "$src1 = $dst" in {
  defm AESENC          : AESI_binop_rm_int<0xDC, "aesenc",
                         int_x86_aesni_aesenc>;
  defm AESENCLAST      : AESI_binop_rm_int<0xDD, "aesenclast",
                         int_x86_aesni_aesenclast>;
  defm AESDEC          : AESI_binop_rm_int<0xDE, "aesdec",
                         int_x86_aesni_aesdec>;
  defm AESDECLAST      : AESI_binop_rm_int<0xDF, "aesdeclast",
                         int_x86_aesni_aesdeclast>;
}
7392
// Perform the AES InvMixColumn Transformation
// AVX form: folds an unaligned load (loadv2i64).
let Predicates = [HasAVX, HasAES] in {
  def VAESIMCrr : AES8I<0xDB, MRMSrcReg, (outs VR128:$dst),
      (ins VR128:$src1),
      "vaesimc\t{$src1, $dst|$dst, $src1}",
      [(set VR128:$dst,
        (int_x86_aesni_aesimc VR128:$src1))]>,
      OpSize, VEX;
  def VAESIMCrm : AES8I<0xDB, MRMSrcMem, (outs VR128:$dst),
      (ins i128mem:$src1),
      "vaesimc\t{$src1, $dst|$dst, $src1}",
      [(set VR128:$dst, (int_x86_aesni_aesimc (loadv2i64 addr:$src1)))]>,
      OpSize, VEX;
}
// Legacy SSE form: memory operand folding requires alignment (memopv2i64).
def AESIMCrr : AES8I<0xDB, MRMSrcReg, (outs VR128:$dst),
  (ins VR128:$src1),
  "aesimc\t{$src1, $dst|$dst, $src1}",
  [(set VR128:$dst,
    (int_x86_aesni_aesimc VR128:$src1))]>,
  OpSize;
def AESIMCrm : AES8I<0xDB, MRMSrcMem, (outs VR128:$dst),
  (ins i128mem:$src1),
  "aesimc\t{$src1, $dst|$dst, $src1}",
  [(set VR128:$dst, (int_x86_aesni_aesimc (memopv2i64 addr:$src1)))]>,
  OpSize;
7418
// AES Round Key Generation Assist
// $src2 is the round constant (RCON) immediate.
let Predicates = [HasAVX, HasAES] in {
  // AVX form: folds an unaligned load (loadv2i64).
  def VAESKEYGENASSIST128rr : AESAI<0xDF, MRMSrcReg, (outs VR128:$dst),
      (ins VR128:$src1, i8imm:$src2),
      "vaeskeygenassist\t{$src2, $src1, $dst|$dst, $src1, $src2}",
      [(set VR128:$dst,
        (int_x86_aesni_aeskeygenassist VR128:$src1, imm:$src2))]>,
      OpSize, VEX;
  def VAESKEYGENASSIST128rm : AESAI<0xDF, MRMSrcMem, (outs VR128:$dst),
      (ins i128mem:$src1, i8imm:$src2),
      "vaeskeygenassist\t{$src2, $src1, $dst|$dst, $src1, $src2}",
      [(set VR128:$dst,
        (int_x86_aesni_aeskeygenassist (loadv2i64 addr:$src1), imm:$src2))]>,
      OpSize, VEX;
}
// Legacy SSE form: memory folding requires alignment (memopv2i64).
def AESKEYGENASSIST128rr : AESAI<0xDF, MRMSrcReg, (outs VR128:$dst),
  (ins VR128:$src1, i8imm:$src2),
  "aeskeygenassist\t{$src2, $src1, $dst|$dst, $src1, $src2}",
  [(set VR128:$dst,
    (int_x86_aesni_aeskeygenassist VR128:$src1, imm:$src2))]>,
  OpSize;
def AESKEYGENASSIST128rm : AESAI<0xDF, MRMSrcMem, (outs VR128:$dst),
  (ins i128mem:$src1, i8imm:$src2),
  "aeskeygenassist\t{$src2, $src1, $dst|$dst, $src1, $src2}",
  [(set VR128:$dst,
    (int_x86_aesni_aeskeygenassist (memopv2i64 addr:$src1), imm:$src2))]>,
  OpSize;
7446
7447//===----------------------------------------------------------------------===//
7448// PCLMUL Instructions
7449//===----------------------------------------------------------------------===//
7450
// AVX carry-less Multiplication instructions
// $src3 selects which quadword of each source is multiplied.
def VPCLMULQDQrr : AVXPCLMULIi8<0x44, MRMSrcReg, (outs VR128:$dst),
           (ins VR128:$src1, VR128:$src2, i8imm:$src3),
           "vpclmulqdq\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
           [(set VR128:$dst,
             (int_x86_pclmulqdq VR128:$src1, VR128:$src2, imm:$src3))]>;

// Memory form folds an unaligned load (loadv2i64), as AVX permits.
def VPCLMULQDQrm : AVXPCLMULIi8<0x44, MRMSrcMem, (outs VR128:$dst),
           (ins VR128:$src1, i128mem:$src2, i8imm:$src3),
           "vpclmulqdq\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
           [(set VR128:$dst, (int_x86_pclmulqdq VR128:$src1,
                              (loadv2i64 addr:$src2), imm:$src3))]>;
7463
// Carry-less Multiplication instructions
// Legacy SSE forms: destructive ($src1 tied to $dst), aligned memory
// folding (memopv2i64).
let Constraints = "$src1 = $dst" in {
def PCLMULQDQrr : PCLMULIi8<0x44, MRMSrcReg, (outs VR128:$dst),
           (ins VR128:$src1, VR128:$src2, i8imm:$src3),
           "pclmulqdq\t{$src3, $src2, $dst|$dst, $src2, $src3}",
           [(set VR128:$dst,
             (int_x86_pclmulqdq VR128:$src1, VR128:$src2, imm:$src3))],
             IIC_SSE_PCLMULQDQ_RR>;

def PCLMULQDQrm : PCLMULIi8<0x44, MRMSrcMem, (outs VR128:$dst),
           (ins VR128:$src1, i128mem:$src2, i8imm:$src3),
           "pclmulqdq\t{$src3, $src2, $dst|$dst, $src2, $src3}",
           [(set VR128:$dst, (int_x86_pclmulqdq VR128:$src1,
                              (memopv2i64 addr:$src2), imm:$src3))],
                              IIC_SSE_PCLMULQDQ_RM>;
} // Constraints = "$src1 = $dst"
7480
7481
// pclmul_alias - Mnemonic aliases (e.g. pclmulhqhqdq) that hard-code the
// immediate selecting the quadword halves: imm bit 0 picks the high/low
// qword of the first source, bit 4 that of the second (per Intel SDM).
multiclass pclmul_alias<string asm, int immop> {
  def : InstAlias<!strconcat("pclmul", asm, "dq {$src, $dst|$dst, $src}"),
                  (PCLMULQDQrr VR128:$dst, VR128:$src, immop)>;

  def : InstAlias<!strconcat("pclmul", asm, "dq {$src, $dst|$dst, $src}"),
                  (PCLMULQDQrm VR128:$dst, i128mem:$src, immop)>;

  def : InstAlias<!strconcat("vpclmul", asm,
                             "dq {$src2, $src1, $dst|$dst, $src1, $src2}"),
                  (VPCLMULQDQrr VR128:$dst, VR128:$src1, VR128:$src2, immop)>;

  def : InstAlias<!strconcat("vpclmul", asm,
                             "dq {$src2, $src1, $dst|$dst, $src1, $src2}"),
                  (VPCLMULQDQrm VR128:$dst, VR128:$src1, i128mem:$src2, immop)>;
}
defm : pclmul_alias<"hqhq", 0x11>;
defm : pclmul_alias<"hqlq", 0x01>;
defm : pclmul_alias<"lqhq", 0x10>;
defm : pclmul_alias<"lqlq", 0x00>;
7501
7502//===----------------------------------------------------------------------===//
7503// SSE4A Instructions
7504//===----------------------------------------------------------------------===//
7505
// AMD SSE4A: bit-field extract/insert and scalar non-temporal stores.
let Predicates = [HasSSE4A] in {

// extrq/insertq are destructive: $src is tied to $dst.
let Constraints = "$src = $dst" in {
// Immediate form: $len bits starting at bit $idx of the low qword.
def EXTRQI : Ii8<0x78, MRM0r, (outs VR128:$dst),
                 (ins VR128:$src, i8imm:$len, i8imm:$idx),
                 "extrq\t{$idx, $len, $src|$src, $len, $idx}",
                 [(set VR128:$dst, (int_x86_sse4a_extrqi VR128:$src, imm:$len,
                                    imm:$idx))]>, TB, OpSize;
// Register form: length/index taken from the low bits of $mask.
def EXTRQ  : I<0x79, MRMSrcReg, (outs VR128:$dst),
              (ins VR128:$src, VR128:$mask),
              "extrq\t{$mask, $src|$src, $mask}",
              [(set VR128:$dst, (int_x86_sse4a_extrq VR128:$src,
                                 VR128:$mask))]>, TB, OpSize;

def INSERTQI : Ii8<0x78, MRMSrcReg, (outs VR128:$dst),
                   (ins VR128:$src, VR128:$src2, i8imm:$len, i8imm:$idx),
                   "insertq\t{$idx, $len, $src2, $src|$src, $src2, $len, $idx}",
                   [(set VR128:$dst, (int_x86_sse4a_insertqi VR128:$src,
                                      VR128:$src2, imm:$len, imm:$idx))]>, XD;
def INSERTQ  : I<0x79, MRMSrcReg, (outs VR128:$dst),
                 (ins VR128:$src, VR128:$mask),
                 "insertq\t{$mask, $src|$src, $mask}",
                 [(set VR128:$dst, (int_x86_sse4a_insertq VR128:$src,
                                    VR128:$mask))]>, XD;
}

// Scalar non-temporal stores of the low 32/64 bits of an XMM register.
def MOVNTSS : I<0x2B, MRMDestMem, (outs), (ins f32mem:$dst, VR128:$src),
                "movntss\t{$src, $dst|$dst, $src}",
                [(int_x86_sse4a_movnt_ss addr:$dst, VR128:$src)]>, XS;

def MOVNTSD : I<0x2B, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
                "movntsd\t{$src, $dst|$dst, $src}",
                [(int_x86_sse4a_movnt_sd addr:$dst, VR128:$src)]>, XD;
}
7540
7541//===----------------------------------------------------------------------===//
7542// AVX Instructions
7543//===----------------------------------------------------------------------===//
7544
7545//===----------------------------------------------------------------------===//
7546// VBROADCAST - Load from memory and broadcast to all elements of the
7547//              destination operand
7548//
// avx_broadcast - Memory-source broadcast: load a scalar (or 128-bit
// chunk) and replicate it into every element of RC.
class avx_broadcast<bits<8> opc, string OpcodeStr, RegisterClass RC,
                    X86MemOperand x86memop, Intrinsic Int> :
  AVX8I<opc, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src),
        !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
        [(set RC:$dst, (Int addr:$src))]>, VEX;

// AVX2 adds register forms
// avx2_broadcast_reg - Broadcast from the low element of an XMM register.
class avx2_broadcast_reg<bits<8> opc, string OpcodeStr, RegisterClass RC,
                         Intrinsic Int> :
  AVX28I<opc, MRMSrcReg, (outs RC:$dst), (ins VR128:$src),
         !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
         [(set RC:$dst, (Int VR128:$src))]>, VEX;
7561
// AVX memory-form broadcasts (128- and 256-bit destinations).
let ExeDomain = SSEPackedSingle in {
  def VBROADCASTSSrm  : avx_broadcast<0x18, "vbroadcastss", VR128, f32mem,
                                      int_x86_avx_vbroadcast_ss>;
  def VBROADCASTSSYrm : avx_broadcast<0x18, "vbroadcastss", VR256, f32mem,
                                      int_x86_avx_vbroadcast_ss_256>, VEX_L;
}
let ExeDomain = SSEPackedDouble in
def VBROADCASTSDYrm  : avx_broadcast<0x19, "vbroadcastsd", VR256, f64mem,
                                    int_x86_avx_vbroadcast_sd_256>, VEX_L;
// Defined via the _pd_256 intrinsic; the _ps_256 variant is mapped to the
// same instruction by the Pat at the bottom of this section.
def VBROADCASTF128 : avx_broadcast<0x1A, "vbroadcastf128", VR256, f128mem,
                                   int_x86_avx_vbroadcastf128_pd_256>, VEX_L;

// AVX2 register-form broadcasts.
let ExeDomain = SSEPackedSingle in {
  def VBROADCASTSSrr  : avx2_broadcast_reg<0x18, "vbroadcastss", VR128,
                                           int_x86_avx2_vbroadcast_ss_ps>;
  def VBROADCASTSSYrr : avx2_broadcast_reg<0x18, "vbroadcastss", VR256,
                                      int_x86_avx2_vbroadcast_ss_ps_256>, VEX_L;
}
let ExeDomain = SSEPackedDouble in
def VBROADCASTSDYrr  : avx2_broadcast_reg<0x19, "vbroadcastsd", VR256,
                                      int_x86_avx2_vbroadcast_sd_pd_256>, VEX_L;

let Predicates = [HasAVX2] in
def VBROADCASTI128 : avx_broadcast<0x5A, "vbroadcasti128", VR256, i128mem,
                                   int_x86_avx2_vbroadcasti128>, VEX_L;

// Fold the single-precision 128-bit broadcast intrinsic onto the same
// VBROADCASTF128 instruction used for the double-precision variant.
let Predicates = [HasAVX] in
def : Pat<(int_x86_avx_vbroadcastf128_ps_256 addr:$src),
          (VBROADCASTF128 addr:$src)>;
7591
7592
7593//===----------------------------------------------------------------------===//
7594// VINSERTF128 - Insert packed floating-point values
7595//
// VINSERTF128 instruction definitions.  They carry no patterns here;
// selection is driven by the Pat<> blocks below.  neverHasSideEffects is
// the older spelling of hasSideEffects = 0.
let neverHasSideEffects = 1, ExeDomain = SSEPackedSingle in {
def VINSERTF128rr : AVXAIi8<0x18, MRMSrcReg, (outs VR256:$dst),
          (ins VR256:$src1, VR128:$src2, i8imm:$src3),
          "vinsertf128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
          []>, VEX_4V, VEX_L;
let mayLoad = 1 in
def VINSERTF128rm : AVXAIi8<0x18, MRMSrcMem, (outs VR256:$dst),
          (ins VR256:$src1, f128mem:$src2, i8imm:$src3),
          "vinsertf128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
          []>, VEX_4V, VEX_L;
}
7607
// FP 128-bit subvector inserts map directly onto VINSERTF128; the
// pointer-sized insert index is converted to the instruction immediate by
// INSERT_get_vinsert128_imm.
let Predicates = [HasAVX] in {
def : Pat<(vinsert128_insert:$ins (v8f32 VR256:$src1), (v4f32 VR128:$src2),
                                   (iPTR imm)),
          (VINSERTF128rr VR256:$src1, VR128:$src2,
                         (INSERT_get_vinsert128_imm VR256:$ins))>;
def : Pat<(vinsert128_insert:$ins (v4f64 VR256:$src1), (v2f64 VR128:$src2),
                                   (iPTR imm)),
          (VINSERTF128rr VR256:$src1, VR128:$src2,
                         (INSERT_get_vinsert128_imm VR256:$ins))>;

def : Pat<(vinsert128_insert:$ins (v8f32 VR256:$src1), (loadv4f32 addr:$src2),
                                   (iPTR imm)),
          (VINSERTF128rm VR256:$src1, addr:$src2,
                         (INSERT_get_vinsert128_imm VR256:$ins))>;
def : Pat<(vinsert128_insert:$ins (v4f64 VR256:$src1), (loadv2f64 addr:$src2),
                                   (iPTR imm)),
          (VINSERTF128rm VR256:$src1, addr:$src2,
                         (INSERT_get_vinsert128_imm VR256:$ins))>;
}

// Without AVX2 there is no VINSERTI128, so integer subvector inserts are
// also lowered to the FP instruction (HasAVX1Only).
let Predicates = [HasAVX1Only] in {
def : Pat<(vinsert128_insert:$ins (v4i64 VR256:$src1), (v2i64 VR128:$src2),
                                   (iPTR imm)),
          (VINSERTF128rr VR256:$src1, VR128:$src2,
                         (INSERT_get_vinsert128_imm VR256:$ins))>;
def : Pat<(vinsert128_insert:$ins (v8i32 VR256:$src1), (v4i32 VR128:$src2),
                                   (iPTR imm)),
          (VINSERTF128rr VR256:$src1, VR128:$src2,
                         (INSERT_get_vinsert128_imm VR256:$ins))>;
def : Pat<(vinsert128_insert:$ins (v32i8 VR256:$src1), (v16i8 VR128:$src2),
                                   (iPTR imm)),
          (VINSERTF128rr VR256:$src1, VR128:$src2,
                         (INSERT_get_vinsert128_imm VR256:$ins))>;
def : Pat<(vinsert128_insert:$ins (v16i16 VR256:$src1), (v8i16 VR128:$src2),
                                   (iPTR imm)),
          (VINSERTF128rr VR256:$src1, VR128:$src2,
                         (INSERT_get_vinsert128_imm VR256:$ins))>;

// Memory forms: integer loads narrower than i64 are matched through a
// bitcast of the canonical v2i64 load.
def : Pat<(vinsert128_insert:$ins (v4i64 VR256:$src1), (loadv2i64 addr:$src2),
                                   (iPTR imm)),
          (VINSERTF128rm VR256:$src1, addr:$src2,
                         (INSERT_get_vinsert128_imm VR256:$ins))>;
def : Pat<(vinsert128_insert:$ins (v8i32 VR256:$src1),
                                   (bc_v4i32 (loadv2i64 addr:$src2)),
                                   (iPTR imm)),
          (VINSERTF128rm VR256:$src1, addr:$src2,
                         (INSERT_get_vinsert128_imm VR256:$ins))>;
def : Pat<(vinsert128_insert:$ins (v32i8 VR256:$src1),
                                   (bc_v16i8 (loadv2i64 addr:$src2)),
                                   (iPTR imm)),
          (VINSERTF128rm VR256:$src1, addr:$src2,
                         (INSERT_get_vinsert128_imm VR256:$ins))>;
def : Pat<(vinsert128_insert:$ins (v16i16 VR256:$src1),
                                   (bc_v8i16 (loadv2i64 addr:$src2)),
                                   (iPTR imm)),
          (VINSERTF128rm VR256:$src1, addr:$src2,
                         (INSERT_get_vinsert128_imm VR256:$ins))>;
}
7666
7667//===----------------------------------------------------------------------===//
7668// VEXTRACTF128 - Extract packed floating-point values
7669//
// VEXTRACTF128 instruction definitions.  No patterns here; selection is
// driven by the Pat<> blocks below.  neverHasSideEffects is the older
// spelling of hasSideEffects = 0.
let neverHasSideEffects = 1, ExeDomain = SSEPackedSingle in {
def VEXTRACTF128rr : AVXAIi8<0x19, MRMDestReg, (outs VR128:$dst),
          (ins VR256:$src1, i8imm:$src2),
          "vextractf128\t{$src2, $src1, $dst|$dst, $src1, $src2}",
          []>, VEX, VEX_L;
let mayStore = 1 in
def VEXTRACTF128mr : AVXAIi8<0x19, MRMDestMem, (outs),
          (ins f128mem:$dst, VR256:$src1, i8imm:$src2),
          "vextractf128\t{$src2, $src1, $dst|$dst, $src1, $src2}",
          []>, VEX, VEX_L;
}
7681
// AVX1 patterns
// FP 128-bit subvector extracts map onto VEXTRACTF128; the pointer-sized
// extract index is converted to the immediate by
// EXTRACT_get_vextract128_imm.
let Predicates = [HasAVX] in {
def : Pat<(vextract128_extract:$ext VR256:$src1, (iPTR imm)),
          (v4f32 (VEXTRACTF128rr
                    (v8f32 VR256:$src1),
                    (EXTRACT_get_vextract128_imm VR128:$ext)))>;
def : Pat<(vextract128_extract:$ext VR256:$src1, (iPTR imm)),
          (v2f64 (VEXTRACTF128rr
                    (v4f64 VR256:$src1),
                    (EXTRACT_get_vextract128_imm VR128:$ext)))>;

def : Pat<(store (v4f32 (vextract128_extract:$ext (v8f32 VR256:$src1),
                         (iPTR imm))), addr:$dst),
          (VEXTRACTF128mr addr:$dst, VR256:$src1,
           (EXTRACT_get_vextract128_imm VR128:$ext))>;
def : Pat<(store (v2f64 (vextract128_extract:$ext (v4f64 VR256:$src1),
                         (iPTR imm))), addr:$dst),
          (VEXTRACTF128mr addr:$dst, VR256:$src1,
           (EXTRACT_get_vextract128_imm VR128:$ext))>;
}

// Without AVX2 there is no VEXTRACTI128, so integer extracts also lower
// to the FP instruction.
// NOTE(review): the integer store patterns below use alignedstore while
// the FP ones above use plain store — confirm the asymmetry is intended.
let Predicates = [HasAVX1Only] in {
def : Pat<(vextract128_extract:$ext VR256:$src1, (iPTR imm)),
          (v2i64 (VEXTRACTF128rr
                  (v4i64 VR256:$src1),
                  (EXTRACT_get_vextract128_imm VR128:$ext)))>;
def : Pat<(vextract128_extract:$ext VR256:$src1, (iPTR imm)),
          (v4i32 (VEXTRACTF128rr
                  (v8i32 VR256:$src1),
                  (EXTRACT_get_vextract128_imm VR128:$ext)))>;
def : Pat<(vextract128_extract:$ext VR256:$src1, (iPTR imm)),
          (v8i16 (VEXTRACTF128rr
                  (v16i16 VR256:$src1),
                  (EXTRACT_get_vextract128_imm VR128:$ext)))>;
def : Pat<(vextract128_extract:$ext VR256:$src1, (iPTR imm)),
          (v16i8 (VEXTRACTF128rr
                  (v32i8 VR256:$src1),
                  (EXTRACT_get_vextract128_imm VR128:$ext)))>;

def : Pat<(alignedstore (v2i64 (vextract128_extract:$ext (v4i64 VR256:$src1),
                                (iPTR imm))), addr:$dst),
          (VEXTRACTF128mr addr:$dst, VR256:$src1,
           (EXTRACT_get_vextract128_imm VR128:$ext))>;
def : Pat<(alignedstore (v4i32 (vextract128_extract:$ext (v8i32 VR256:$src1),
                                (iPTR imm))), addr:$dst),
          (VEXTRACTF128mr addr:$dst, VR256:$src1,
           (EXTRACT_get_vextract128_imm VR128:$ext))>;
def : Pat<(alignedstore (v8i16 (vextract128_extract:$ext (v16i16 VR256:$src1),
                                (iPTR imm))), addr:$dst),
          (VEXTRACTF128mr addr:$dst, VR256:$src1,
           (EXTRACT_get_vextract128_imm VR128:$ext))>;
def : Pat<(alignedstore (v16i8 (vextract128_extract:$ext (v32i8 VR256:$src1),
                                (iPTR imm))), addr:$dst),
          (VEXTRACTF128mr addr:$dst, VR256:$src1,
           (EXTRACT_get_vextract128_imm VR128:$ext))>;
}
7738
7739//===----------------------------------------------------------------------===//
7740// VMASKMOV - Conditional SIMD Packed Loads and Stores
7741//
// avx_movmask_rm - VMASKMOVPS/PD: masked packed loads ($src1 = mask,
// $src2 = memory) and masked stores ($src1 = mask, $src2 = data), in both
// 128-bit and 256-bit (Y) widths.
multiclass avx_movmask_rm<bits<8> opc_rm, bits<8> opc_mr, string OpcodeStr,
                          Intrinsic IntLd, Intrinsic IntLd256,
                          Intrinsic IntSt, Intrinsic IntSt256> {
  // Masked load, 128-bit.
  def rm  : AVX8I<opc_rm, MRMSrcMem, (outs VR128:$dst),
             (ins VR128:$src1, f128mem:$src2),
             !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
             [(set VR128:$dst, (IntLd addr:$src2, VR128:$src1))]>,
             VEX_4V;
  // Masked load, 256-bit.
  def Yrm : AVX8I<opc_rm, MRMSrcMem, (outs VR256:$dst),
             (ins VR256:$src1, f256mem:$src2),
             !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
             [(set VR256:$dst, (IntLd256 addr:$src2, VR256:$src1))]>,
             VEX_4V, VEX_L;
  // Masked store, 128-bit.
  def mr  : AVX8I<opc_mr, MRMDestMem, (outs),
             (ins f128mem:$dst, VR128:$src1, VR128:$src2),
             !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
             [(IntSt addr:$dst, VR128:$src1, VR128:$src2)]>, VEX_4V;
  // Masked store, 256-bit.
  def Ymr : AVX8I<opc_mr, MRMDestMem, (outs),
             (ins f256mem:$dst, VR256:$src1, VR256:$src2),
             !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
             [(IntSt256 addr:$dst, VR256:$src1, VR256:$src2)]>, VEX_4V, VEX_L;
}

let ExeDomain = SSEPackedSingle in
defm VMASKMOVPS : avx_movmask_rm<0x2C, 0x2E, "vmaskmovps",
                                 int_x86_avx_maskload_ps,
                                 int_x86_avx_maskload_ps_256,
                                 int_x86_avx_maskstore_ps,
                                 int_x86_avx_maskstore_ps_256>;
let ExeDomain = SSEPackedDouble in
defm VMASKMOVPD : avx_movmask_rm<0x2D, 0x2F, "vmaskmovpd",
                                 int_x86_avx_maskload_pd,
                                 int_x86_avx_maskload_pd_256,
                                 int_x86_avx_maskstore_pd,
                                 int_x86_avx_maskstore_pd_256>;
7777
7778//===----------------------------------------------------------------------===//
7779// VPERMIL - Permute Single and Double Floating-Point Values
7780//
// avx_permil - VPERMILPS/PD: permute FP elements within 128-bit lanes.
// rr/rm take a variable control vector (selected via the IntVar
// intrinsic); ri/mi take an 8-bit immediate control (selected via the
// X86VPermilp node).
multiclass avx_permil<bits<8> opc_rm, bits<8> opc_rmi, string OpcodeStr,
                      RegisterClass RC, X86MemOperand x86memop_f,
                      X86MemOperand x86memop_i, PatFrag i_frag,
                      Intrinsic IntVar, ValueType vt> {
  def rr  : AVX8I<opc_rm, MRMSrcReg, (outs RC:$dst),
             (ins RC:$src1, RC:$src2),
             !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
             [(set RC:$dst, (IntVar RC:$src1, RC:$src2))]>, VEX_4V;
  def rm  : AVX8I<opc_rm, MRMSrcMem, (outs RC:$dst),
             (ins RC:$src1, x86memop_i:$src2),
             !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
             [(set RC:$dst, (IntVar RC:$src1,
                             (bitconvert (i_frag addr:$src2))))]>, VEX_4V;

  def ri  : AVXAIi8<opc_rmi, MRMSrcReg, (outs RC:$dst),
             (ins RC:$src1, i8imm:$src2),
             !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
             [(set RC:$dst, (vt (X86VPermilp RC:$src1, (i8 imm:$src2))))]>, VEX;
  // Use the unaligned 'load' fragment, not 'memop': AVX memory operands
  // carry no alignment restriction, and 'memop' (which requires 16-byte
  // alignment) would prevent folding unaligned loads.  This also matches
  // the rm form above, whose instantiations pass unaligned
  // loadv2i64/loadv4i64 as i_frag.
  def mi  : AVXAIi8<opc_rmi, MRMSrcMem, (outs RC:$dst),
             (ins x86memop_f:$src1, i8imm:$src2),
             !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
             [(set RC:$dst,
               (vt (X86VPermilp (load addr:$src1), (i8 imm:$src2))))]>, VEX;
}
7805
// VPERMILPS/PD instantiations, 128-bit and 256-bit (Y) widths.
let ExeDomain = SSEPackedSingle in {
  defm VPERMILPS  : avx_permil<0x0C, 0x04, "vpermilps", VR128, f128mem, i128mem,
                               loadv2i64, int_x86_avx_vpermilvar_ps, v4f32>;
  defm VPERMILPSY : avx_permil<0x0C, 0x04, "vpermilps", VR256, f256mem, i256mem,
                       loadv4i64, int_x86_avx_vpermilvar_ps_256, v8f32>, VEX_L;
}
let ExeDomain = SSEPackedDouble in {
  defm VPERMILPD  : avx_permil<0x0D, 0x05, "vpermilpd", VR128, f128mem, i128mem,
                               loadv2i64, int_x86_avx_vpermilvar_pd, v2f64>;
  defm VPERMILPDY : avx_permil<0x0D, 0x05, "vpermilpd", VR256, f256mem, i256mem,
                       loadv4i64, int_x86_avx_vpermilvar_pd_256, v4f64>, VEX_L;
}

// Integer-typed immediate permutes reuse the FP VPERMILPS/PD forms.
let Predicates = [HasAVX] in {
def : Pat<(v8i32 (X86VPermilp VR256:$src1, (i8 imm:$imm))),
          (VPERMILPSYri VR256:$src1, imm:$imm)>;
def : Pat<(v4i64 (X86VPermilp VR256:$src1, (i8 imm:$imm))),
          (VPERMILPDYri VR256:$src1, imm:$imm)>;
def : Pat<(v8i32 (X86VPermilp (bc_v8i32 (loadv4i64 addr:$src1)),
                               (i8 imm:$imm))),
          (VPERMILPSYmi addr:$src1, imm:$imm)>;
def : Pat<(v4i64 (X86VPermilp (loadv4i64 addr:$src1), (i8 imm:$imm))),
          (VPERMILPDYmi addr:$src1, imm:$imm)>;

def : Pat<(v2i64 (X86VPermilp VR128:$src1, (i8 imm:$imm))),
          (VPERMILPDri VR128:$src1, imm:$imm)>;
def : Pat<(v2i64 (X86VPermilp (loadv2i64 addr:$src1), (i8 imm:$imm))),
          (VPERMILPDmi addr:$src1, imm:$imm)>;
}
7835
7836//===----------------------------------------------------------------------===//
7837// VPERM2F128 - Permute Floating-Point Values in 128-bit chunks
7838//
// VPERM2F128: select/permute 128-bit halves of two 256-bit sources,
// controlled by the $src3 immediate.
let ExeDomain = SSEPackedSingle in {
def VPERM2F128rr : AVXAIi8<0x06, MRMSrcReg, (outs VR256:$dst),
          (ins VR256:$src1, VR256:$src2, i8imm:$src3),
          "vperm2f128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
          [(set VR256:$dst, (v8f32 (X86VPerm2x128 VR256:$src1, VR256:$src2,
                              (i8 imm:$src3))))]>, VEX_4V, VEX_L;
def VPERM2F128rm : AVXAIi8<0x06, MRMSrcMem, (outs VR256:$dst),
          (ins VR256:$src1, f256mem:$src2, i8imm:$src3),
          "vperm2f128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
          [(set VR256:$dst, (X86VPerm2x128 VR256:$src1, (loadv8f32 addr:$src2),
                             (i8 imm:$src3)))]>, VEX_4V, VEX_L;
}

// v4f64 variants reuse the v8f32-typed instruction.
let Predicates = [HasAVX] in {
def : Pat<(v4f64 (X86VPerm2x128 VR256:$src1, VR256:$src2, (i8 imm:$imm))),
          (VPERM2F128rr VR256:$src1, VR256:$src2, imm:$imm)>;
def : Pat<(v4f64 (X86VPerm2x128 VR256:$src1,
                  (loadv4f64 addr:$src2), (i8 imm:$imm))),
          (VPERM2F128rm VR256:$src1, addr:$src2, imm:$imm)>;
}

// Without AVX2 there is no VPERM2I128, so integer 128-bit permutes also
// lower to the FP instruction.
let Predicates = [HasAVX1Only] in {
def : Pat<(v8i32 (X86VPerm2x128 VR256:$src1, VR256:$src2, (i8 imm:$imm))),
          (VPERM2F128rr VR256:$src1, VR256:$src2, imm:$imm)>;
def : Pat<(v4i64 (X86VPerm2x128 VR256:$src1, VR256:$src2, (i8 imm:$imm))),
          (VPERM2F128rr VR256:$src1, VR256:$src2, imm:$imm)>;
def : Pat<(v32i8 (X86VPerm2x128 VR256:$src1, VR256:$src2, (i8 imm:$imm))),
          (VPERM2F128rr VR256:$src1, VR256:$src2, imm:$imm)>;
def : Pat<(v16i16 (X86VPerm2x128 VR256:$src1, VR256:$src2, (i8 imm:$imm))),
          (VPERM2F128rr VR256:$src1, VR256:$src2, imm:$imm)>;

def : Pat<(v8i32 (X86VPerm2x128 VR256:$src1,
                  (bc_v8i32 (loadv4i64 addr:$src2)), (i8 imm:$imm))),
          (VPERM2F128rm VR256:$src1, addr:$src2, imm:$imm)>;
def : Pat<(v4i64 (X86VPerm2x128 VR256:$src1,
                  (loadv4i64 addr:$src2), (i8 imm:$imm))),
          (VPERM2F128rm VR256:$src1, addr:$src2, imm:$imm)>;
def : Pat<(v32i8 (X86VPerm2x128 VR256:$src1,
                  (bc_v32i8 (loadv4i64 addr:$src2)), (i8 imm:$imm))),
          (VPERM2F128rm VR256:$src1, addr:$src2, imm:$imm)>;
def : Pat<(v16i16 (X86VPerm2x128 VR256:$src1,
                  (bc_v16i16 (loadv4i64 addr:$src2)), (i8 imm:$imm))),
          (VPERM2F128rm VR256:$src1, addr:$src2, imm:$imm)>;
}
7883
7884//===----------------------------------------------------------------------===//
7885// VZERO - Zero YMM registers
7886//
// Both instructions may modify every YMM register, so list all 16 as
// implicit defs for the register allocator / scheduler.
let Defs = [YMM0, YMM1, YMM2, YMM3, YMM4, YMM5, YMM6, YMM7,
            YMM8, YMM9, YMM10, YMM11, YMM12, YMM13, YMM14, YMM15] in {
  // Zero All YMM registers
  def VZEROALL : I<0x77, RawFrm, (outs), (ins), "vzeroall",
                  [(int_x86_avx_vzeroall)]>, TB, VEX, VEX_L, Requires<[HasAVX]>;

  // Zero Upper bits of YMM registers
  def VZEROUPPER : I<0x77, RawFrm, (outs), (ins), "vzeroupper",
                     [(int_x86_avx_vzeroupper)]>, TB, VEX, Requires<[HasAVX]>;
}
7897
7898//===----------------------------------------------------------------------===//
7899// Half precision conversion instructions
7900//===----------------------------------------------------------------------===//
// f16c_ph2ps - vcvtph2ps: widen packed half-precision values (from the
// low half of an XMM register, or from memory) to single precision.
multiclass f16c_ph2ps<RegisterClass RC, X86MemOperand x86memop, Intrinsic Int> {
  def rr : I<0x13, MRMSrcReg, (outs RC:$dst), (ins VR128:$src),
             "vcvtph2ps\t{$src, $dst|$dst, $src}",
             [(set RC:$dst, (Int VR128:$src))]>,
             T8, OpSize, VEX;
  // Memory form has no pattern; marked load-only and side-effect free.
  let neverHasSideEffects = 1, mayLoad = 1 in
  def rm : I<0x13, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src),
             "vcvtph2ps\t{$src, $dst|$dst, $src}", []>, T8, OpSize, VEX;
}

// f16c_ps2ph - vcvtps2ph: narrow packed single-precision values to half
// precision; $src2 is the rounding-control immediate.
multiclass f16c_ps2ph<RegisterClass RC, X86MemOperand x86memop, Intrinsic Int> {
  def rr : Ii8<0x1D, MRMDestReg, (outs VR128:$dst),
               (ins RC:$src1, i32i8imm:$src2),
               "vcvtps2ph\t{$src2, $src1, $dst|$dst, $src1, $src2}",
               [(set VR128:$dst, (Int RC:$src1, imm:$src2))]>,
               TA, OpSize, VEX;
  // Memory form has no pattern; marked store-only and side-effect free.
  let neverHasSideEffects = 1, mayStore = 1 in
  def mr : Ii8<0x1D, MRMDestMem, (outs),
               (ins x86memop:$dst, RC:$src1, i32i8imm:$src2),
               "vcvtps2ph\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>,
               TA, OpSize, VEX;
}

// Memory operands are half the vector width: 64 bits of halves for the
// 128-bit forms, 128 bits for the 256-bit (Y) forms.
let Predicates = [HasF16C] in {
  defm VCVTPH2PS  : f16c_ph2ps<VR128, f64mem, int_x86_vcvtph2ps_128>;
  defm VCVTPH2PSY : f16c_ph2ps<VR256, f128mem, int_x86_vcvtph2ps_256>, VEX_L;
  defm VCVTPS2PH  : f16c_ps2ph<VR128, f64mem, int_x86_vcvtps2ph_128>;
  defm VCVTPS2PHY : f16c_ps2ph<VR256, f128mem, int_x86_vcvtps2ph_256>, VEX_L;
}
7930
7931//===----------------------------------------------------------------------===//
7932// AVX2 Instructions
7933//===----------------------------------------------------------------------===//
7934
7935  /// AVX2_binop_rmi_int - AVX2 binary operator with 8-bit immediate
// Generates reg/reg/imm (rri) and reg/mem/imm (rmi) forms of a three-operand
// AVX2 instruction that matches intrinsic IntId. memop_frag is the vector
// load fragment bitconverted to the intrinsic's element type.
7936  multiclass AVX2_binop_rmi_int<bits<8> opc, string OpcodeStr,
7937                   Intrinsic IntId, RegisterClass RC, PatFrag memop_frag,
7938                   X86MemOperand x86memop> {
  // Register form is commutable unless the instantiation site overrides it.
7939    let isCommutable = 1 in
7940    def rri : AVX2AIi8<opc, MRMSrcReg, (outs RC:$dst),
7941          (ins RC:$src1, RC:$src2, u32u8imm:$src3),
7942          !strconcat(OpcodeStr,
7943              "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
7944          [(set RC:$dst, (IntId RC:$src1, RC:$src2, imm:$src3))]>,
7945          VEX_4V;
7946    def rmi : AVX2AIi8<opc, MRMSrcMem, (outs RC:$dst),
7947          (ins RC:$src1, x86memop:$src2, u32u8imm:$src3),
7948          !strconcat(OpcodeStr,
7949              "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
7950          [(set RC:$dst,
7951            (IntId RC:$src1,
7952             (bitconvert (memop_frag addr:$src2)), imm:$src3))]>,
7953          VEX_4V;
7954  }
7955
// VPBLENDD: dword blend selected by the immediate mask. The blend is not
// commutable (swapping sources requires inverting the mask), so override
// the multiclass's isCommutable = 1 default here.
7956  let isCommutable = 0 in {
7957  defm VPBLENDD : AVX2_binop_rmi_int<0x02, "vpblendd", int_x86_avx2_pblendd_128,
7958                                     VR128, loadv2i64, i128mem>;
7959  defm VPBLENDDY : AVX2_binop_rmi_int<0x02, "vpblendd", int_x86_avx2_pblendd_256,
7960                                      VR256, loadv4i64, i256mem>, VEX_L;
7961  }
7962
// Select the generic X86Blendi node on i32 vectors to VPBLENDD(Y)rri.
7963  def : Pat<(v4i32 (X86Blendi (v4i32 VR128:$src1), (v4i32 VR128:$src2),
7964                    imm:$mask)),
7965            (VPBLENDDrri VR128:$src1, VR128:$src2, imm:$mask)>;
7966  def : Pat<(v8i32 (X86Blendi (v8i32 VR256:$src1), (v8i32 VR256:$src2),
7967                    imm:$mask)),
7968            (VPBLENDDYrri VR256:$src1, VR256:$src2, imm:$mask)>;
7969
7970//===----------------------------------------------------------------------===//
7971// VPBROADCAST - Load from memory and broadcast to all elements of the
7972//               destination operand
7973//
// Generates the four VPBROADCAST* forms: 128-bit dest from register (rr) or
// memory (rm), and 256-bit dest from register (Yrr) or memory (Yrm). The
// source register is always an XMM; memory forms broadcast a single scalar
// element loaded by ld_frag.
7974  multiclass avx2_broadcast<bits<8> opc, string OpcodeStr,
7975                            X86MemOperand x86memop, PatFrag ld_frag,
7976                            Intrinsic Int128, Intrinsic Int256> {
7977    def rr : AVX28I<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
7978                    !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
7979                    [(set VR128:$dst, (Int128 VR128:$src))]>, VEX;
7980    def rm : AVX28I<opc, MRMSrcMem, (outs VR128:$dst), (ins x86memop:$src),
7981                    !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
7982                    [(set VR128:$dst,
  // The loaded scalar is wrapped in scalar_to_vector to match the
  // vector-typed intrinsic operand.
7983                      (Int128 (scalar_to_vector (ld_frag addr:$src))))]>, VEX;
7984    def Yrr : AVX28I<opc, MRMSrcReg, (outs VR256:$dst), (ins VR128:$src),
7985                     !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
7986                     [(set VR256:$dst, (Int256 VR128:$src))]>, VEX, VEX_L;
7987    def Yrm : AVX28I<opc, MRMSrcMem, (outs VR256:$dst), (ins x86memop:$src),
7988                     !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
7989                     [(set VR256:$dst,
7990                      (Int256 (scalar_to_vector (ld_frag addr:$src))))]>,
7991                     VEX, VEX_L;
7992  }
7993
// Instantiate VPBROADCASTB/W/D/Q for the four integer element widths.
7994  defm VPBROADCASTB  : avx2_broadcast<0x78, "vpbroadcastb", i8mem, loadi8,
7995                                      int_x86_avx2_pbroadcastb_128,
7996                                      int_x86_avx2_pbroadcastb_256>;
7997  defm VPBROADCASTW  : avx2_broadcast<0x79, "vpbroadcastw", i16mem, loadi16,
7998                                      int_x86_avx2_pbroadcastw_128,
7999                                      int_x86_avx2_pbroadcastw_256>;
8000  defm VPBROADCASTD  : avx2_broadcast<0x58, "vpbroadcastd", i32mem, loadi32,
8001                                      int_x86_avx2_pbroadcastd_128,
8002                                      int_x86_avx2_pbroadcastd_256>;
8003  defm VPBROADCASTQ  : avx2_broadcast<0x59, "vpbroadcastq", i64mem, loadi64,
8004                                      int_x86_avx2_pbroadcastq_128,
8005                                      int_x86_avx2_pbroadcastq_256>;
8006
// Selection patterns for the generic X86VBroadcast node when AVX2 is
// available: broadcast-from-load, broadcast-from-XMM-register, and a
// fallback for broadcast of a scalar register class value.
8007  let Predicates = [HasAVX2] in {
8008    def : Pat<(v16i8 (X86VBroadcast (loadi8 addr:$src))),
8009            (VPBROADCASTBrm addr:$src)>;
8010    def : Pat<(v32i8 (X86VBroadcast (loadi8 addr:$src))),
8011            (VPBROADCASTBYrm addr:$src)>;
8012    def : Pat<(v8i16 (X86VBroadcast (loadi16 addr:$src))),
8013            (VPBROADCASTWrm addr:$src)>;
8014    def : Pat<(v16i16 (X86VBroadcast (loadi16 addr:$src))),
8015            (VPBROADCASTWYrm addr:$src)>;
8016    def : Pat<(v4i32 (X86VBroadcast (loadi32 addr:$src))),
8017            (VPBROADCASTDrm addr:$src)>;
8018    def : Pat<(v8i32 (X86VBroadcast (loadi32 addr:$src))),
8019            (VPBROADCASTDYrm addr:$src)>;
8020    def : Pat<(v2i64 (X86VBroadcast (loadi64 addr:$src))),
8021            (VPBROADCASTQrm addr:$src)>;
8022    def : Pat<(v4i64 (X86VBroadcast (loadi64 addr:$src))),
8023            (VPBROADCASTQYrm addr:$src)>;
8024
  // Broadcast from the low element of an XMM register.
8025    def : Pat<(v16i8 (X86VBroadcast (v16i8 VR128:$src))),
8026            (VPBROADCASTBrr VR128:$src)>;
8027    def : Pat<(v32i8 (X86VBroadcast (v16i8 VR128:$src))),
8028            (VPBROADCASTBYrr VR128:$src)>;
8029    def : Pat<(v8i16 (X86VBroadcast (v8i16 VR128:$src))),
8030            (VPBROADCASTWrr VR128:$src)>;
8031    def : Pat<(v16i16 (X86VBroadcast (v8i16 VR128:$src))),
8032            (VPBROADCASTWYrr VR128:$src)>;
8033    def : Pat<(v4i32 (X86VBroadcast (v4i32 VR128:$src))),
8034            (VPBROADCASTDrr VR128:$src)>;
8035    def : Pat<(v8i32 (X86VBroadcast (v4i32 VR128:$src))),
8036            (VPBROADCASTDYrr VR128:$src)>;
8037    def : Pat<(v2i64 (X86VBroadcast (v2i64 VR128:$src))),
8038            (VPBROADCASTQrr VR128:$src)>;
8039    def : Pat<(v4i64 (X86VBroadcast (v2i64 VR128:$src))),
8040            (VPBROADCASTQYrr VR128:$src)>;
8041    def : Pat<(v4f32 (X86VBroadcast (v4f32 VR128:$src))),
8042            (VBROADCASTSSrr VR128:$src)>;
8043    def : Pat<(v8f32 (X86VBroadcast (v4f32 VR128:$src))),
8044            (VBROADCASTSSYrr VR128:$src)>;
  // NOTE(review): v2f64 broadcast is lowered to the integer-domain
  // VPBROADCASTQrr (same 64-bit element size); confirm a possible
  // FP/int domain-crossing penalty is acceptable here.
8045    def : Pat<(v2f64 (X86VBroadcast (v2f64 VR128:$src))),
8046            (VPBROADCASTQrr VR128:$src)>;
8047    def : Pat<(v4f64 (X86VBroadcast (v2f64 VR128:$src))),
8048            (VBROADCASTSDYrr VR128:$src)>;
8049
8050    // Provide fallback in case the load node that is used in the patterns above
8051    // is used by additional users, which prevents the pattern selection.
8052    let AddedComplexity = 20 in {
8053      def : Pat<(v4f32 (X86VBroadcast FR32:$src)),
8054                (VBROADCASTSSrr (COPY_TO_REGCLASS FR32:$src, VR128))>;
8055      def : Pat<(v8f32 (X86VBroadcast FR32:$src)),
8056                (VBROADCASTSSYrr (COPY_TO_REGCLASS FR32:$src, VR128))>;
8057      def : Pat<(v4f64 (X86VBroadcast FR64:$src)),
8058                (VBROADCASTSDYrr (COPY_TO_REGCLASS FR64:$src, VR128))>;
8059
8060      def : Pat<(v4i32 (X86VBroadcast GR32:$src)),
8061                (VBROADCASTSSrr (COPY_TO_REGCLASS GR32:$src, VR128))>;
8062      def : Pat<(v8i32 (X86VBroadcast GR32:$src)),
8063                (VBROADCASTSSYrr (COPY_TO_REGCLASS GR32:$src, VR128))>;
8064      def : Pat<(v4i64 (X86VBroadcast GR64:$src)),
8065                (VBROADCASTSDYrr (COPY_TO_REGCLASS GR64:$src, VR128))>;
8066    }
8067  }
8068
8069// AVX1 broadcast patterns
// AVX1-only integer broadcast-from-load: no VPBROADCAST exists without
// AVX2, so reuse the FP VBROADCASTSS/SD encodings (bit pattern is
// identical for a broadcast load).
8070  let Predicates = [HasAVX1Only] in {
8071  def : Pat<(v8i32 (X86VBroadcast (loadi32 addr:$src))),
8072            (VBROADCASTSSYrm addr:$src)>;
8073  def : Pat<(v4i64 (X86VBroadcast (loadi64 addr:$src))),
8074            (VBROADCASTSDYrm addr:$src)>;
8075  def : Pat<(v4i32 (X86VBroadcast (loadi32 addr:$src))),
8076            (VBROADCASTSSrm addr:$src)>;
8077  }
8078
// FP broadcast-from-load patterns available on any AVX target, plus a
// register-source fallback built from shuffles/inserts (AVX1 has no
// register-source broadcast instruction).
8079  let Predicates = [HasAVX] in {
8080  def : Pat<(v8f32 (X86VBroadcast (loadf32 addr:$src))),
8081            (VBROADCASTSSYrm addr:$src)>;
8082  def : Pat<(v4f64 (X86VBroadcast (loadf64 addr:$src))),
8083            (VBROADCASTSDYrm addr:$src)>;
8084  def : Pat<(v4f32 (X86VBroadcast (loadf32 addr:$src))),
8085            (VBROADCASTSSrm addr:$src)>;
8086
8087    // Provide fallback in case the load node that is used in the patterns above
8088    // is used by additional users, which prevents the pattern selection.
8089    let AddedComplexity = 20 in {
8090    // 128bit broadcasts:
  // VPSHUFDri imm 0 replicates dword 0 into all four lanes; imm 0x44
  // (= 01'00'01'00b) selects dwords {0,1,0,1}, duplicating the low qword.
8091    def : Pat<(v4f32 (X86VBroadcast FR32:$src)),
8092              (VPSHUFDri (COPY_TO_REGCLASS FR32:$src, VR128), 0)>;
  // 256-bit: build the low half, then VINSERTF128 the same value into the
  // high half of an IMPLICIT_DEF ymm.
8093    def : Pat<(v8f32 (X86VBroadcast FR32:$src)),
8094              (VINSERTF128rr (INSERT_SUBREG (v8f32 (IMPLICIT_DEF)),
8095                (VPSHUFDri (COPY_TO_REGCLASS FR32:$src, VR128), 0), sub_xmm),
8096                (VPSHUFDri (COPY_TO_REGCLASS FR32:$src, VR128), 0), 1)>;
8097    def : Pat<(v4f64 (X86VBroadcast FR64:$src)),
8098              (VINSERTF128rr (INSERT_SUBREG (v4f64 (IMPLICIT_DEF)),
8099                (VPSHUFDri (COPY_TO_REGCLASS FR64:$src, VR128), 0x44), sub_xmm),
8100                (VPSHUFDri (COPY_TO_REGCLASS FR64:$src, VR128), 0x44), 1)>;
8101
8102    def : Pat<(v4i32 (X86VBroadcast GR32:$src)),
8103              (VPSHUFDri (COPY_TO_REGCLASS GR32:$src, VR128), 0)>;
8104    def : Pat<(v8i32 (X86VBroadcast GR32:$src)),
8105              (VINSERTF128rr (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)),
8106                (VPSHUFDri (COPY_TO_REGCLASS GR32:$src, VR128), 0), sub_xmm),
8107                (VPSHUFDri (COPY_TO_REGCLASS GR32:$src, VR128), 0), 1)>;
8108    def : Pat<(v4i64 (X86VBroadcast GR64:$src)),
8109              (VINSERTF128rr (INSERT_SUBREG (v4i64 (IMPLICIT_DEF)),
8110                (VPSHUFDri (COPY_TO_REGCLASS GR64:$src, VR128), 0x44), sub_xmm),
8111                (VPSHUFDri (COPY_TO_REGCLASS GR64:$src, VR128), 0x44), 1)>;
8112    }
8113  }
8114
8115//===----------------------------------------------------------------------===//
8116// VPERM - Permute instructions
8117//
8118
// Variable cross-lane permute (VPERMD/VPERMPS): 256-bit only, indices come
// from $src2 (register or memory). Matches the X86VPermv node.
8119  multiclass avx2_perm<bits<8> opc, string OpcodeStr, PatFrag mem_frag,
8120                       ValueType OpVT> {
8121    def Yrr : AVX28I<opc, MRMSrcReg, (outs VR256:$dst),
8122                     (ins VR256:$src1, VR256:$src2),
8123                     !strconcat(OpcodeStr,
8124                         "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
8125                     [(set VR256:$dst,
8126                       (OpVT (X86VPermv VR256:$src1, VR256:$src2)))]>,
8127                     VEX_4V, VEX_L;
8128    def Yrm : AVX28I<opc, MRMSrcMem, (outs VR256:$dst),
8129                     (ins VR256:$src1, i256mem:$src2),
8130                     !strconcat(OpcodeStr,
8131                         "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
8132                     [(set VR256:$dst,
8133                       (OpVT (X86VPermv VR256:$src1,
8134                              (bitconvert (mem_frag addr:$src2)))))]>,
8135                     VEX_4V, VEX_L;
8136  }
8137
// Integer (VPERMD) and single-precision (VPERMPS) variable permutes.
8138  defm VPERMD : avx2_perm<0x36, "vpermd", loadv4i64, v8i32>;
8139  let ExeDomain = SSEPackedSingle in
8140  defm VPERMPS : avx2_perm<0x16, "vpermps", loadv8f32, v8f32>;
8141
// Immediate-controlled 64-bit-element permute (VPERMQ/VPERMPD): the 8-bit
// immediate selects the qword order. Matches the X86VPermi node.
8142  multiclass avx2_perm_imm<bits<8> opc, string OpcodeStr, PatFrag mem_frag,
8143                           ValueType OpVT> {
8144    def Yri : AVX2AIi8<opc, MRMSrcReg, (outs VR256:$dst),
8145                       (ins VR256:$src1, i8imm:$src2),
8146                       !strconcat(OpcodeStr,
8147                           "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
8148                       [(set VR256:$dst,
8149                         (OpVT (X86VPermi VR256:$src1, (i8 imm:$src2))))]>,
8150                       VEX, VEX_L;
8151    def Ymi : AVX2AIi8<opc, MRMSrcMem, (outs VR256:$dst),
8152                       (ins i256mem:$src1, i8imm:$src2),
8153                       !strconcat(OpcodeStr,
8154                           "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
8155                       [(set VR256:$dst,
8156                         (OpVT (X86VPermi (mem_frag addr:$src1),
8157                                (i8 imm:$src2))))]>, VEX, VEX_L;
8158  }
8159
// Integer (VPERMQ) and double-precision (VPERMPD) immediate permutes;
// VEX_W selects the 64-bit element form.
8160  defm VPERMQ : avx2_perm_imm<0x00, "vpermq", loadv4i64, v4i64>, VEX_W;
8161  let ExeDomain = SSEPackedDouble in
8162  defm VPERMPD : avx2_perm_imm<0x01, "vpermpd", loadv4f64, v4f64>, VEX_W;
8163
8164//===----------------------------------------------------------------------===//
8165// VPERM2I128 - Permute Floating-Point Values in 128-bit chunks
8166//
// VPERM2I128: select/permute 128-bit halves of two 256-bit sources via the
// 8-bit immediate. Canonical pattern type is v4i64; other integer types
// are handled by the patterns below.
8167  def VPERM2I128rr : AVX2AIi8<0x46, MRMSrcReg, (outs VR256:$dst),
8168            (ins VR256:$src1, VR256:$src2, i8imm:$src3),
8169            "vperm2i128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
8170            [(set VR256:$dst, (v4i64 (X86VPerm2x128 VR256:$src1, VR256:$src2,
8171                              (i8 imm:$src3))))]>, VEX_4V, VEX_L;
8172  def VPERM2I128rm : AVX2AIi8<0x46, MRMSrcMem, (outs VR256:$dst),
8173            (ins VR256:$src1, f256mem:$src2, i8imm:$src3),
8174            "vperm2i128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
8175            [(set VR256:$dst, (X86VPerm2x128 VR256:$src1, (loadv4i64 addr:$src2),
8176                               (i8 imm:$src3)))]>, VEX_4V, VEX_L;
8177
// Map X86VPerm2x128 on the remaining integer vector types onto the same
// v4i64-typed VPERM2I128 instruction (register and folded-load forms).
8178  let Predicates = [HasAVX2] in {
8179  def : Pat<(v8i32 (X86VPerm2x128 VR256:$src1, VR256:$src2, (i8 imm:$imm))),
8180            (VPERM2I128rr VR256:$src1, VR256:$src2, imm:$imm)>;
8181  def : Pat<(v32i8 (X86VPerm2x128 VR256:$src1, VR256:$src2, (i8 imm:$imm))),
8182            (VPERM2I128rr VR256:$src1, VR256:$src2, imm:$imm)>;
8183  def : Pat<(v16i16 (X86VPerm2x128 VR256:$src1, VR256:$src2, (i8 imm:$imm))),
8184            (VPERM2I128rr VR256:$src1, VR256:$src2, imm:$imm)>;
8185
8186  def : Pat<(v32i8 (X86VPerm2x128 VR256:$src1, (bc_v32i8 (loadv4i64 addr:$src2)),
8187                    (i8 imm:$imm))),
8188            (VPERM2I128rm VR256:$src1, addr:$src2, imm:$imm)>;
8189  def : Pat<(v16i16 (X86VPerm2x128 VR256:$src1,
8190                     (bc_v16i16 (loadv4i64 addr:$src2)), (i8 imm:$imm))),
8191            (VPERM2I128rm VR256:$src1, addr:$src2, imm:$imm)>;
8192  def : Pat<(v8i32 (X86VPerm2x128 VR256:$src1, (bc_v8i32 (loadv4i64 addr:$src2)),
8193                    (i8 imm:$imm))),
8194            (VPERM2I128rm VR256:$src1, addr:$src2, imm:$imm)>;
8195  }
8196
8197
8198//===----------------------------------------------------------------------===//
8199// VINSERTI128 - Insert packed integer values
8200//
// VINSERTI128: insert a 128-bit value into the selected half of a ymm.
// No patterns here; selection is done by the vinsert128_insert patterns
// below, so both forms are pattern-less and marked side-effect free.
8201  let neverHasSideEffects = 1 in {
8202  def VINSERTI128rr : AVX2AIi8<0x38, MRMSrcReg, (outs VR256:$dst),
8203            (ins VR256:$src1, VR128:$src2, i8imm:$src3),
8204            "vinserti128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
8205            []>, VEX_4V, VEX_L;
8206  let mayLoad = 1 in
8207  def VINSERTI128rm : AVX2AIi8<0x38, MRMSrcMem, (outs VR256:$dst),
8208            (ins VR256:$src1, i128mem:$src2, i8imm:$src3),
8209            "vinserti128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
8210            []>, VEX_4V, VEX_L;
8211  }
8212
// Select subvector-insert nodes to VINSERTI128. INSERT_get_vinsert128_imm
// converts the element insertion index into the instruction's 0/1
// half-selector immediate. Register sources first, then folded loads.
8213  let Predicates = [HasAVX2] in {
8214  def : Pat<(vinsert128_insert:$ins (v4i64 VR256:$src1), (v2i64 VR128:$src2),
8215                                     (iPTR imm)),
8216            (VINSERTI128rr VR256:$src1, VR128:$src2,
8217                           (INSERT_get_vinsert128_imm VR256:$ins))>;
8218  def : Pat<(vinsert128_insert:$ins (v8i32 VR256:$src1), (v4i32 VR128:$src2),
8219                                     (iPTR imm)),
8220            (VINSERTI128rr VR256:$src1, VR128:$src2,
8221                           (INSERT_get_vinsert128_imm VR256:$ins))>;
8222  def : Pat<(vinsert128_insert:$ins (v32i8 VR256:$src1), (v16i8 VR128:$src2),
8223                                     (iPTR imm)),
8224            (VINSERTI128rr VR256:$src1, VR128:$src2,
8225                           (INSERT_get_vinsert128_imm VR256:$ins))>;
8226  def : Pat<(vinsert128_insert:$ins (v16i16 VR256:$src1), (v8i16 VR128:$src2),
8227                                     (iPTR imm)),
8228            (VINSERTI128rr VR256:$src1, VR128:$src2,
8229                           (INSERT_get_vinsert128_imm VR256:$ins))>;
8230
8231  def : Pat<(vinsert128_insert:$ins (v4i64 VR256:$src1), (loadv2i64 addr:$src2),
8232                                     (iPTR imm)),
8233            (VINSERTI128rm VR256:$src1, addr:$src2,
8234                           (INSERT_get_vinsert128_imm VR256:$ins))>;
8235  def : Pat<(vinsert128_insert:$ins (v8i32 VR256:$src1),
8236                                     (bc_v4i32 (loadv2i64 addr:$src2)),
8237                                     (iPTR imm)),
8238            (VINSERTI128rm VR256:$src1, addr:$src2,
8239                           (INSERT_get_vinsert128_imm VR256:$ins))>;
8240  def : Pat<(vinsert128_insert:$ins (v32i8 VR256:$src1),
8241                                     (bc_v16i8 (loadv2i64 addr:$src2)),
8242                                     (iPTR imm)),
8243            (VINSERTI128rm VR256:$src1, addr:$src2,
8244                           (INSERT_get_vinsert128_imm VR256:$ins))>;
8245  def : Pat<(vinsert128_insert:$ins (v16i16 VR256:$src1),
8246                                     (bc_v8i16 (loadv2i64 addr:$src2)),
8247                                     (iPTR imm)),
8248            (VINSERTI128rm VR256:$src1, addr:$src2,
8249                           (INSERT_get_vinsert128_imm VR256:$ins))>;
8250  }
8251
8252//===----------------------------------------------------------------------===//
8253// VEXTRACTI128 - Extract packed integer values
8254//
// VEXTRACTI128: extract the selected 128-bit half of a ymm. The register
// form matches the intrinsic directly; the store form is pattern-less and
// handled by the store patterns below.
8255  def VEXTRACTI128rr : AVX2AIi8<0x39, MRMDestReg, (outs VR128:$dst),
8256            (ins VR256:$src1, i8imm:$src2),
8257            "vextracti128\t{$src2, $src1, $dst|$dst, $src1, $src2}",
8258            [(set VR128:$dst,
8259              (int_x86_avx2_vextracti128 VR256:$src1, imm:$src2))]>,
8260            VEX, VEX_L;
8261  let neverHasSideEffects = 1, mayStore = 1 in
8262  def VEXTRACTI128mr : AVX2AIi8<0x39, MRMDestMem, (outs),
8263            (ins i128mem:$dst, VR256:$src1, i8imm:$src2),
8264            "vextracti128\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>,
8265            VEX, VEX_L;
8266
// Select subvector-extract nodes to VEXTRACTI128, and fold
// extract-then-store into the memory form. EXTRACT_get_vextract128_imm
// converts the extraction index into the 0/1 half-selector immediate.
8267  let Predicates = [HasAVX2] in {
8268  def : Pat<(vextract128_extract:$ext VR256:$src1, (iPTR imm)),
8269            (v2i64 (VEXTRACTI128rr
8270                      (v4i64 VR256:$src1),
8271                      (EXTRACT_get_vextract128_imm VR128:$ext)))>;
8272  def : Pat<(vextract128_extract:$ext VR256:$src1, (iPTR imm)),
8273            (v4i32 (VEXTRACTI128rr
8274                      (v8i32 VR256:$src1),
8275                      (EXTRACT_get_vextract128_imm VR128:$ext)))>;
8276  def : Pat<(vextract128_extract:$ext VR256:$src1, (iPTR imm)),
8277            (v8i16 (VEXTRACTI128rr
8278                      (v16i16 VR256:$src1),
8279                      (EXTRACT_get_vextract128_imm VR128:$ext)))>;
8280  def : Pat<(vextract128_extract:$ext VR256:$src1, (iPTR imm)),
8281            (v16i8 (VEXTRACTI128rr
8282                      (v32i8 VR256:$src1),
8283                      (EXTRACT_get_vextract128_imm VR128:$ext)))>;
8284
8285  def : Pat<(store (v2i64 (vextract128_extract:$ext (v4i64 VR256:$src1),
8286                           (iPTR imm))), addr:$dst),
8287            (VEXTRACTI128mr addr:$dst, VR256:$src1,
8288             (EXTRACT_get_vextract128_imm VR128:$ext))>;
8289  def : Pat<(store (v4i32 (vextract128_extract:$ext (v8i32 VR256:$src1),
8290                           (iPTR imm))), addr:$dst),
8291            (VEXTRACTI128mr addr:$dst, VR256:$src1,
8292             (EXTRACT_get_vextract128_imm VR128:$ext))>;
8293  def : Pat<(store (v8i16 (vextract128_extract:$ext (v16i16 VR256:$src1),
8294                           (iPTR imm))), addr:$dst),
8295            (VEXTRACTI128mr addr:$dst, VR256:$src1,
8296             (EXTRACT_get_vextract128_imm VR128:$ext))>;
8297  def : Pat<(store (v16i8 (vextract128_extract:$ext (v32i8 VR256:$src1),
8298                           (iPTR imm))), addr:$dst),
8299            (VEXTRACTI128mr addr:$dst, VR256:$src1,
8300             (EXTRACT_get_vextract128_imm VR128:$ext))>;
8301  }
8302
8303//===----------------------------------------------------------------------===//
8304// VPMASKMOV - Conditional SIMD Integer Packed Loads and Stores
8305//
// Masked load (rm/Yrm, opcode 0x8c) and masked store (mr/Ymr, opcode 0x8e)
// forms of VPMASKMOVD/Q for 128- and 256-bit vectors, matched via the
// corresponding maskload/maskstore intrinsics.
8306  multiclass avx2_pmovmask<string OpcodeStr,
8307                           Intrinsic IntLd128, Intrinsic IntLd256,
8308                           Intrinsic IntSt128, Intrinsic IntSt256> {
8309    def rm  : AVX28I<0x8c, MRMSrcMem, (outs VR128:$dst),
8310               (ins VR128:$src1, i128mem:$src2),
8311               !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
8312               [(set VR128:$dst, (IntLd128 addr:$src2, VR128:$src1))]>, VEX_4V;
8313    def Yrm : AVX28I<0x8c, MRMSrcMem, (outs VR256:$dst),
8314               (ins VR256:$src1, i256mem:$src2),
8315               !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
8316               [(set VR256:$dst, (IntLd256 addr:$src2, VR256:$src1))]>,
8317               VEX_4V, VEX_L;
8318    def mr  : AVX28I<0x8e, MRMDestMem, (outs),
8319               (ins i128mem:$dst, VR128:$src1, VR128:$src2),
8320               !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
8321               [(IntSt128 addr:$dst, VR128:$src1, VR128:$src2)]>, VEX_4V;
8322    def Ymr : AVX28I<0x8e, MRMDestMem, (outs),
8323               (ins i256mem:$dst, VR256:$src1, VR256:$src2),
8324               !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
8325               [(IntSt256 addr:$dst, VR256:$src1, VR256:$src2)]>, VEX_4V, VEX_L;
8326  }
8327
// Instantiate dword (VPMASKMOVD) and qword (VPMASKMOVQ, VEX_W) masked
// load/store instructions.
8328  defm VPMASKMOVD : avx2_pmovmask<"vpmaskmovd",
8329                                  int_x86_avx2_maskload_d,
8330                                  int_x86_avx2_maskload_d_256,
8331                                  int_x86_avx2_maskstore_d,
8332                                  int_x86_avx2_maskstore_d_256>;
8333  defm VPMASKMOVQ : avx2_pmovmask<"vpmaskmovq",
8334                                  int_x86_avx2_maskload_q,
8335                                  int_x86_avx2_maskload_q_256,
8336                                  int_x86_avx2_maskstore_q,
8337                                  int_x86_avx2_maskstore_q_256>, VEX_W;
8338
8339
8340//===----------------------------------------------------------------------===//
8341// Variable Bit Shifts
8342//
// Per-element variable shifts (VPSLLV/VPSRLV/VPSRAV): each destination
// element of $src1 is shifted by the corresponding element of $src2.
// Matches the generic shl/srl/sra nodes with a vector shift amount.
8343  multiclass avx2_var_shift<bits<8> opc, string OpcodeStr, SDNode OpNode,
8344                            ValueType vt128, ValueType vt256> {
8345    def rr  : AVX28I<opc, MRMSrcReg, (outs VR128:$dst),
8346               (ins VR128:$src1, VR128:$src2),
8347               !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
8348               [(set VR128:$dst,
8349                 (vt128 (OpNode VR128:$src1, (vt128 VR128:$src2))))]>,
8350               VEX_4V;
8351    def rm  : AVX28I<opc, MRMSrcMem, (outs VR128:$dst),
8352               (ins VR128:$src1, i128mem:$src2),
8353               !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
8354               [(set VR128:$dst,
8355                 (vt128 (OpNode VR128:$src1,
8356                         (vt128 (bitconvert (loadv2i64 addr:$src2))))))]>,
8357               VEX_4V;
8358    def Yrr : AVX28I<opc, MRMSrcReg, (outs VR256:$dst),
8359               (ins VR256:$src1, VR256:$src2),
8360               !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
8361               [(set VR256:$dst,
8362                 (vt256 (OpNode VR256:$src1, (vt256 VR256:$src2))))]>,
8363               VEX_4V, VEX_L;
8364    def Yrm : AVX28I<opc, MRMSrcMem, (outs VR256:$dst),
8365               (ins VR256:$src1, i256mem:$src2),
8366               !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
8367               [(set VR256:$dst,
8368                 (vt256 (OpNode VR256:$src1,
8369                         (vt256 (bitconvert (loadv4i64 addr:$src2))))))]>,
8370               VEX_4V, VEX_L;
8371  }
8372
// Instantiate the variable shifts; VEX_W selects the 64-bit element forms.
// Note: AVX2 provides no variable arithmetic right shift for 64-bit
// elements, so there is no VPSRAVQ here.
8373  defm VPSLLVD : avx2_var_shift<0x47, "vpsllvd", shl, v4i32, v8i32>;
8374  defm VPSLLVQ : avx2_var_shift<0x47, "vpsllvq", shl, v2i64, v4i64>, VEX_W;
8375  defm VPSRLVD : avx2_var_shift<0x45, "vpsrlvd", srl, v4i32, v8i32>;
8376  defm VPSRLVQ : avx2_var_shift<0x45, "vpsrlvq", srl, v2i64, v4i64>, VEX_W;
8377  defm VPSRAVD : avx2_var_shift<0x46, "vpsravd", sra, v4i32, v8i32>;
8378
8379//===----------------------------------------------------------------------===//
8380// VGATHER - GATHER Operations
// Gather loads: each def has two results — the gathered data ($dst) and the
// written-back mask ($mask_wb, cleared per completed element). No patterns;
// gathers are selected manually. memop128/memop256 carry the vector index
// addressing mode (vx/vy = xmm/ymm index register).
8381  multiclass avx2_gather<bits<8> opc, string OpcodeStr, RegisterClass RC256,
8382                         X86MemOperand memop128, X86MemOperand memop256> {
8383    def rm  : AVX28I<opc, MRMSrcMem, (outs VR128:$dst, VR128:$mask_wb),
8384              (ins VR128:$src1, memop128:$src2, VR128:$mask),
8385              !strconcat(OpcodeStr,
8386                "\t{$mask, $src2, $dst|$dst, $src2, $mask}"),
8387              []>, VEX_4VOp3;
8388    def Yrm : AVX28I<opc, MRMSrcMem, (outs RC256:$dst, RC256:$mask_wb),
8389              (ins RC256:$src1, memop256:$src2, RC256:$mask),
8390              !strconcat(OpcodeStr,
8391                "\t{$mask, $src2, $dst|$dst, $src2, $mask}"),
8392              []>, VEX_4VOp3, VEX_L;
8393  }
8394
// Instantiate all AVX2 gathers. Constraints tie the data and mask inputs to
// their result operands ($src1 = $dst, $mask = $mask_wb) and mark both
// results earlyclobber, since the hardware updates them in place.
// RC256 is VR128 for the Q-index/D-data forms whose 256-bit variant still
// produces only four 32-bit elements (VGATHERQPS / VPGATHERQD).
8395  let mayLoad = 1, Constraints
8396    = "@earlyclobber $dst,@earlyclobber $mask_wb, $src1 = $dst, $mask = $mask_wb"
8397    in {
8398    defm VGATHERDPD : avx2_gather<0x92, "vgatherdpd", VR256, vx64mem, vx64mem>, VEX_W;
8399    defm VGATHERQPD : avx2_gather<0x93, "vgatherqpd", VR256, vx64mem, vy64mem>, VEX_W;
8400    defm VGATHERDPS : avx2_gather<0x92, "vgatherdps", VR256, vx32mem, vy32mem>;
8401    defm VGATHERQPS : avx2_gather<0x93, "vgatherqps", VR128, vx32mem, vy32mem>;
8402    defm VPGATHERDQ : avx2_gather<0x90, "vpgatherdq", VR256, vx64mem, vx64mem>, VEX_W;
8403    defm VPGATHERQQ : avx2_gather<0x91, "vpgatherqq", VR256, vx64mem, vy64mem>, VEX_W;
8404    defm VPGATHERDD : avx2_gather<0x90, "vpgatherdd", VR256, vx32mem, vy32mem>;
8405    defm VPGATHERQD : avx2_gather<0x91, "vpgatherqd", VR128, vx32mem, vy32mem>;
8406  }
8407