pmmintrin.h revision 360784
/*===---- pmmintrin.h - SSE3 intrinsics ------------------------------------===
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 *
 *===-----------------------------------------------------------------------===
 */
9
10#ifndef __PMMINTRIN_H
11#define __PMMINTRIN_H
12
13#include <emmintrin.h>
14
/* Attribute set applied to every intrinsic wrapper defined in this header:
 * forced inlining, no debug info, SSE3 code generation and a minimum vector
 * width of 128 bits. */
#define __DEFAULT_FN_ATTRS                                                     \
  __attribute__((__always_inline__, __nodebug__, __target__("sse3"),          \
                 __min_vector_width__(128)))
18
19/// Loads data from an unaligned memory location to elements in a 128-bit
20///    vector.
21///
22///    If the address of the data is not 16-byte aligned, the instruction may
23///    read two adjacent aligned blocks of memory to retrieve the requested
24///    data.
25///
26/// \headerfile <x86intrin.h>
27///
28/// This intrinsic corresponds to the <c> VLDDQU </c> instruction.
29///
30/// \param __p
31///    A pointer to a 128-bit integer vector containing integer values.
32/// \returns A 128-bit vector containing the moved values.
33static __inline__ __m128i __DEFAULT_FN_ATTRS
34_mm_lddqu_si128(__m128i const *__p)
35{
36  return (__m128i)__builtin_ia32_lddqu((char const *)__p);
37}
38
39/// Adds the even-indexed values and subtracts the odd-indexed values of
40///    two 128-bit vectors of [4 x float].
41///
42/// \headerfile <x86intrin.h>
43///
44/// This intrinsic corresponds to the <c> VADDSUBPS </c> instruction.
45///
46/// \param __a
47///    A 128-bit vector of [4 x float] containing the left source operand.
48/// \param __b
49///    A 128-bit vector of [4 x float] containing the right source operand.
50/// \returns A 128-bit vector of [4 x float] containing the alternating sums and
51///    differences of both operands.
52static __inline__ __m128 __DEFAULT_FN_ATTRS
53_mm_addsub_ps(__m128 __a, __m128 __b)
54{
55  return __builtin_ia32_addsubps((__v4sf)__a, (__v4sf)__b);
56}
57
58/// Horizontally adds the adjacent pairs of values contained in two
59///    128-bit vectors of [4 x float].
60///
61/// \headerfile <x86intrin.h>
62///
63/// This intrinsic corresponds to the <c> VHADDPS </c> instruction.
64///
65/// \param __a
66///    A 128-bit vector of [4 x float] containing one of the source operands.
67///    The horizontal sums of the values are stored in the lower bits of the
68///    destination.
69/// \param __b
70///    A 128-bit vector of [4 x float] containing one of the source operands.
71///    The horizontal sums of the values are stored in the upper bits of the
72///    destination.
73/// \returns A 128-bit vector of [4 x float] containing the horizontal sums of
74///    both operands.
75static __inline__ __m128 __DEFAULT_FN_ATTRS
76_mm_hadd_ps(__m128 __a, __m128 __b)
77{
78  return __builtin_ia32_haddps((__v4sf)__a, (__v4sf)__b);
79}
80
81/// Horizontally subtracts the adjacent pairs of values contained in two
82///    128-bit vectors of [4 x float].
83///
84/// \headerfile <x86intrin.h>
85///
86/// This intrinsic corresponds to the <c> VHSUBPS </c> instruction.
87///
88/// \param __a
89///    A 128-bit vector of [4 x float] containing one of the source operands.
90///    The horizontal differences between the values are stored in the lower
91///    bits of the destination.
92/// \param __b
93///    A 128-bit vector of [4 x float] containing one of the source operands.
94///    The horizontal differences between the values are stored in the upper
95///    bits of the destination.
96/// \returns A 128-bit vector of [4 x float] containing the horizontal
97///    differences of both operands.
98static __inline__ __m128 __DEFAULT_FN_ATTRS
99_mm_hsub_ps(__m128 __a, __m128 __b)
100{
101  return __builtin_ia32_hsubps((__v4sf)__a, (__v4sf)__b);
102}
103
104/// Moves and duplicates odd-indexed values from a 128-bit vector
105///    of [4 x float] to float values stored in a 128-bit vector of
106///    [4 x float].
107///
108/// \headerfile <x86intrin.h>
109///
110/// This intrinsic corresponds to the <c> VMOVSHDUP </c> instruction.
111///
112/// \param __a
113///    A 128-bit vector of [4 x float]. \n
114///    Bits [127:96] of the source are written to bits [127:96] and [95:64] of
115///    the destination. \n
116///    Bits [63:32] of the source are written to bits [63:32] and [31:0] of the
117///    destination.
118/// \returns A 128-bit vector of [4 x float] containing the moved and duplicated
119///    values.
120static __inline__ __m128 __DEFAULT_FN_ATTRS
121_mm_movehdup_ps(__m128 __a)
122{
123  return __builtin_shufflevector((__v4sf)__a, (__v4sf)__a, 1, 1, 3, 3);
124}
125
126/// Duplicates even-indexed values from a 128-bit vector of
127///    [4 x float] to float values stored in a 128-bit vector of [4 x float].
128///
129/// \headerfile <x86intrin.h>
130///
131/// This intrinsic corresponds to the <c> VMOVSLDUP </c> instruction.
132///
133/// \param __a
134///    A 128-bit vector of [4 x float] \n
135///    Bits [95:64] of the source are written to bits [127:96] and [95:64] of
136///    the destination. \n
137///    Bits [31:0] of the source are written to bits [63:32] and [31:0] of the
138///    destination.
139/// \returns A 128-bit vector of [4 x float] containing the moved and duplicated
140///    values.
141static __inline__ __m128 __DEFAULT_FN_ATTRS
142_mm_moveldup_ps(__m128 __a)
143{
144  return __builtin_shufflevector((__v4sf)__a, (__v4sf)__a, 0, 0, 2, 2);
145}
146
147/// Adds the even-indexed values and subtracts the odd-indexed values of
148///    two 128-bit vectors of [2 x double].
149///
150/// \headerfile <x86intrin.h>
151///
152/// This intrinsic corresponds to the <c> VADDSUBPD </c> instruction.
153///
154/// \param __a
155///    A 128-bit vector of [2 x double] containing the left source operand.
156/// \param __b
157///    A 128-bit vector of [2 x double] containing the right source operand.
158/// \returns A 128-bit vector of [2 x double] containing the alternating sums
159///    and differences of both operands.
160static __inline__ __m128d __DEFAULT_FN_ATTRS
161_mm_addsub_pd(__m128d __a, __m128d __b)
162{
163  return __builtin_ia32_addsubpd((__v2df)__a, (__v2df)__b);
164}
165
166/// Horizontally adds the pairs of values contained in two 128-bit
167///    vectors of [2 x double].
168///
169/// \headerfile <x86intrin.h>
170///
171/// This intrinsic corresponds to the <c> VHADDPD </c> instruction.
172///
173/// \param __a
174///    A 128-bit vector of [2 x double] containing one of the source operands.
175///    The horizontal sum of the values is stored in the lower bits of the
176///    destination.
177/// \param __b
178///    A 128-bit vector of [2 x double] containing one of the source operands.
179///    The horizontal sum of the values is stored in the upper bits of the
180///    destination.
181/// \returns A 128-bit vector of [2 x double] containing the horizontal sums of
182///    both operands.
183static __inline__ __m128d __DEFAULT_FN_ATTRS
184_mm_hadd_pd(__m128d __a, __m128d __b)
185{
186  return __builtin_ia32_haddpd((__v2df)__a, (__v2df)__b);
187}
188
189/// Horizontally subtracts the pairs of values contained in two 128-bit
190///    vectors of [2 x double].
191///
192/// \headerfile <x86intrin.h>
193///
194/// This intrinsic corresponds to the <c> VHSUBPD </c> instruction.
195///
196/// \param __a
197///    A 128-bit vector of [2 x double] containing one of the source operands.
198///    The horizontal difference of the values is stored in the lower bits of
199///    the destination.
200/// \param __b
201///    A 128-bit vector of [2 x double] containing one of the source operands.
202///    The horizontal difference of the values is stored in the upper bits of
203///    the destination.
204/// \returns A 128-bit vector of [2 x double] containing the horizontal
205///    differences of both operands.
206static __inline__ __m128d __DEFAULT_FN_ATTRS
207_mm_hsub_pd(__m128d __a, __m128d __b)
208{
209  return __builtin_ia32_hsubpd((__v2df)__a, (__v2df)__b);
210}
211
/// Loads a single double-precision value from memory and replicates it into
///    both elements of a 128-bit vector of [2 x double].
///
///    This macro expands to a call to _mm_load1_pd() with the same argument.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128d _mm_loaddup_pd(double const *dp);
/// \endcode
///
/// This intrinsic corresponds to the <c> VMOVDDUP </c> instruction.
///
/// \param dp
///    A pointer to the double-precision value to be loaded and duplicated.
/// \returns A 128-bit vector of [2 x double] whose two elements both hold
///    the loaded value.
#define _mm_loaddup_pd(dp) _mm_load1_pd(dp)
228
229/// Moves and duplicates the double-precision value in the lower bits of
230///    a 128-bit vector of [2 x double] to double-precision values stored in a
231///    128-bit vector of [2 x double].
232///
233/// \headerfile <x86intrin.h>
234///
235/// This intrinsic corresponds to the <c> VMOVDDUP </c> instruction.
236///
237/// \param __a
238///    A 128-bit vector of [2 x double]. Bits [63:0] are written to bits
239///    [127:64] and [63:0] of the destination.
240/// \returns A 128-bit vector of [2 x double] containing the moved and
241///    duplicated values.
242static __inline__ __m128d __DEFAULT_FN_ATTRS
243_mm_movedup_pd(__m128d __a)
244{
245  return __builtin_shufflevector((__v2df)__a, (__v2df)__a, 0, 0);
246}
247
248/// Establishes a linear address memory range to be monitored and puts
249///    the processor in the monitor event pending state. Data stored in the
250///    monitored address range causes the processor to exit the pending state.
251///
252/// \headerfile <x86intrin.h>
253///
254/// This intrinsic corresponds to the <c> MONITOR </c> instruction.
255///
256/// \param __p
257///    The memory range to be monitored. The size of the range is determined by
258///    CPUID function 0000_0005h.
259/// \param __extensions
260///    Optional extensions for the monitoring state.
261/// \param __hints
262///    Optional hints for the monitoring state.
263static __inline__ void __DEFAULT_FN_ATTRS
264_mm_monitor(void const *__p, unsigned __extensions, unsigned __hints)
265{
266  __builtin_ia32_monitor(__p, __extensions, __hints);
267}
268
269/// Used with the MONITOR instruction to wait while the processor is in
270///    the monitor event pending state. Data stored in the monitored address
271///    range causes the processor to exit the pending state.
272///
273/// \headerfile <x86intrin.h>
274///
275/// This intrinsic corresponds to the <c> MWAIT </c> instruction.
276///
277/// \param __extensions
278///    Optional extensions for the monitoring state, which may vary by
279///    processor.
280/// \param __hints
281///    Optional hints for the monitoring state, which may vary by processor.
282static __inline__ void __DEFAULT_FN_ATTRS
283_mm_mwait(unsigned __extensions, unsigned __hints)
284{
285  __builtin_ia32_mwait(__extensions, __hints);
286}
287
288#undef __DEFAULT_FN_ATTRS
289
290#endif /* __PMMINTRIN_H */
291