// This header file provides a simple API translation layer
// from SSE intrinsics to their corresponding Arm/Aarch64 NEON versions.
//
// This header file does not yet translate all of the SSE intrinsics.
// Contributors to this work are:
//   John W. Ratcliff <jratcliffscarab@gmail.com>
//   Brandon Rowlett <browlett@nvidia.com>
//   Ken Fast <kfast@gdeb.com>
//   Eric van Beurden <evanbeurden@nvidia.com>
//   Alexander Potylitsin <apotylitsin@nvidia.com>
//   Hasindu Gamaarachchi <hasindu2008@gmail.com>
//   Jim Huang <jserv@biilabs.io>
//   Mark Cheng <marktwtn@biilabs.io>
//   Malcolm James MacLeod <malcolm@gulden.com>
//   Devin Hussey (easyaspi314) <husseydevin@gmail.com>
//   Sebastian Pop <spop@amazon.com>
/*
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
#if defined(__GNUC__) || defined(__clang__)

#pragma push_macro("FORCE_INLINE")
#pragma push_macro("ALIGN_STRUCT")
#define FORCE_INLINE static inline __attribute__((always_inline))
#define ALIGN_STRUCT(x) __attribute__((aligned(x)))

#else

#error "Macro name collisions may happen with unknown compiler"
#ifdef FORCE_INLINE
#undef FORCE_INLINE
#endif
#define FORCE_INLINE static inline
#ifndef ALIGN_STRUCT
#define ALIGN_STRUCT(x) __declspec(align(x))
#endif

#endif

#include <stdint.h>
#include <arm_neon.h>
/*
 * MACRO for the shuffle parameter for _mm_shuffle_ps().
 * Argument fp3 is a digit[0123] that represents the fp from argument "b"
 * of mm_shuffle_ps that will be placed in fp3 of result. fp2 is the same
 * for fp2 in result. fp1 is a digit[0123] that represents the fp from
 * argument "a" of mm_shuffle_ps that will be placed in fp1 of result.
 * fp0 is the same for fp0 of result.
 */
#define _MM_SHUFFLE(fp3, fp2, fp1, fp0) \
    (((fp3) << 6) | ((fp2) << 4) | ((fp1) << 2) | ((fp0)))
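
// Example (illustrative): _MM_SHUFFLE(3, 2, 1, 0) evaluates to 0xE4, the
// identity selector for _mm_shuffle_ps:
//
//   __m128 r = _mm_shuffle_ps(a, b, _MM_SHUFFLE(3, 2, 1, 0));
//   // r0 = a0, r1 = a1, r2 = b2, r3 = b3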
/* indicate immediate constant argument in a given range */
#define __constrange(a, b) const

typedef float32x2_t __m64;
typedef float32x4_t __m128;
typedef int64x2_t __m128i;
// ******************************************
// type-safe casting between types
// ******************************************

#define vreinterpretq_m128_f16(x) vreinterpretq_f32_f16(x)
#define vreinterpretq_m128_f32(x) (x)
#define vreinterpretq_m128_f64(x) vreinterpretq_f32_f64(x)

#define vreinterpretq_m128_u8(x) vreinterpretq_f32_u8(x)
#define vreinterpretq_m128_u16(x) vreinterpretq_f32_u16(x)
#define vreinterpretq_m128_u32(x) vreinterpretq_f32_u32(x)
#define vreinterpretq_m128_u64(x) vreinterpretq_f32_u64(x)

#define vreinterpretq_m128_s8(x) vreinterpretq_f32_s8(x)
#define vreinterpretq_m128_s16(x) vreinterpretq_f32_s16(x)
#define vreinterpretq_m128_s32(x) vreinterpretq_f32_s32(x)
#define vreinterpretq_m128_s64(x) vreinterpretq_f32_s64(x)

#define vreinterpretq_f16_m128(x) vreinterpretq_f16_f32(x)
#define vreinterpretq_f32_m128(x) (x)
#define vreinterpretq_f64_m128(x) vreinterpretq_f64_f32(x)

#define vreinterpretq_u8_m128(x) vreinterpretq_u8_f32(x)
#define vreinterpretq_u16_m128(x) vreinterpretq_u16_f32(x)
#define vreinterpretq_u32_m128(x) vreinterpretq_u32_f32(x)
#define vreinterpretq_u64_m128(x) vreinterpretq_u64_f32(x)

#define vreinterpretq_s8_m128(x) vreinterpretq_s8_f32(x)
#define vreinterpretq_s16_m128(x) vreinterpretq_s16_f32(x)
#define vreinterpretq_s32_m128(x) vreinterpretq_s32_f32(x)
#define vreinterpretq_s64_m128(x) vreinterpretq_s64_f32(x)

#define vreinterpretq_m128i_s8(x) vreinterpretq_s64_s8(x)
#define vreinterpretq_m128i_s16(x) vreinterpretq_s64_s16(x)
#define vreinterpretq_m128i_s32(x) vreinterpretq_s64_s32(x)
#define vreinterpretq_m128i_s64(x) (x)

#define vreinterpretq_m128i_u8(x) vreinterpretq_s64_u8(x)
#define vreinterpretq_m128i_u16(x) vreinterpretq_s64_u16(x)
#define vreinterpretq_m128i_u32(x) vreinterpretq_s64_u32(x)
#define vreinterpretq_m128i_u64(x) vreinterpretq_s64_u64(x)

#define vreinterpretq_s8_m128i(x) vreinterpretq_s8_s64(x)
#define vreinterpretq_s16_m128i(x) vreinterpretq_s16_s64(x)
#define vreinterpretq_s32_m128i(x) vreinterpretq_s32_s64(x)
#define vreinterpretq_s64_m128i(x) (x)

#define vreinterpretq_u8_m128i(x) vreinterpretq_u8_s64(x)
#define vreinterpretq_u16_m128i(x) vreinterpretq_u16_s64(x)
#define vreinterpretq_u32_m128i(x) vreinterpretq_u32_s64(x)
#define vreinterpretq_u64_m128i(x) vreinterpretq_u64_s64(x)
// A struct is defined in this header file called 'SIMDVec' which can be used
// by applications which attempt to access the contents of an __m128 struct
// directly. It is important to note that accessing the __m128 struct directly
// is considered bad coding practice by Microsoft: @see:
// https://msdn.microsoft.com/en-us/library/ayeb3ayc.aspx
//
// However, some legacy source code may try to access the contents of an __m128
// struct directly so the developer can use the SIMDVec as an alias for it. Any
// casting must be done manually by the developer, as you cannot cast or
// otherwise alias the base NEON data type for intrinsic operations.
//
// This union is intended to allow direct access to an __m128 variable using the
// names that the MSVC compiler provides. It should really only be used when
// trying to access the members of the vector as integer values. GCC/clang
// allow native access to the float members through a simple array access
// operator (in C since 4.6, in C++ since 4.8).
//
// Ideally, direct accesses to SIMD vectors should not be used, since they can
// cause a performance hit. If it really is needed, however, the original __m128
// variable can be aliased with a pointer to this union and used to access
// individual components. The use of this union should be hidden behind a macro
// that is used throughout the codebase to access the members instead of always
// declaring this type of variable.
typedef union ALIGN_STRUCT(16) SIMDVec {
    float m128_f32[4];     // as floats - do not use this. Added for convenience.
    int8_t m128_i8[16];    // as signed 8-bit integers.
    int16_t m128_i16[8];   // as signed 16-bit integers.
    int32_t m128_i32[4];   // as signed 32-bit integers.
    int64_t m128_i64[2];   // as signed 64-bit integers.
    uint8_t m128_u8[16];   // as unsigned 8-bit integers.
    uint16_t m128_u16[8];  // as unsigned 16-bit integers.
    uint32_t m128_u32[4];  // as unsigned 32-bit integers.
    uint64_t m128_u64[2];  // as unsigned 64-bit integers.
} SIMDVec;
// casting using SIMDVec
#define vreinterpretq_nth_u64_m128i(x, n) (((SIMDVec *) &x)->m128_u64[n])
#define vreinterpretq_nth_u32_m128i(x, n) (((SIMDVec *) &x)->m128_u32[n])
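
// Usage sketch (illustrative): reading integer lanes of an __m128i through the
// SIMDVec union, per the note above.
//
//   __m128i v = _mm_set_epi32(4, 3, 2, 1);               // lanes [1, 2, 3, 4]
//   uint32_t lane0 = vreinterpretq_nth_u32_m128i(v, 0);  // 1
//   uint32_t lane3 = vreinterpretq_nth_u32_m128i(v, 3);  // 4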
// ******************************************
// Backwards compatibility for compilers that lack specific type support
// ******************************************

// Older gcc does not define vld1q_u8_x4 type
#if defined(__GNUC__) && !defined(__clang__)

FORCE_INLINE uint8x16x4_t vld1q_u8_x4(const uint8_t *p)
{
    uint8x16x4_t ret;

    ret.val[0] = vld1q_u8(p + 0);
    ret.val[1] = vld1q_u8(p + 16);
    ret.val[2] = vld1q_u8(p + 32);
    ret.val[3] = vld1q_u8(p + 48);

    return ret;
}

#endif
// ******************************************
// Set/get methods
// ******************************************
// Loads one cache line of data from address p to a location closer to the
// processor. https://msdn.microsoft.com/en-us/library/84szxsww(v=vs.100).aspx
FORCE_INLINE void _mm_prefetch(const void *p, int i)
{
    (void) i;  // the locality hint is not used on NEON
    __builtin_prefetch(p);
}

// Extracts the lower-order floating-point value from the parameter:
// https://msdn.microsoft.com/en-us/library/bb514059%28v=vs.120%29.aspx?f=255&MSPPError=-2147217396
FORCE_INLINE float _mm_cvtss_f32(__m128 a)
{
    return vgetq_lane_f32(vreinterpretq_f32_m128(a), 0);
}
// Sets the 128-bit value to zero
// https://msdn.microsoft.com/en-us/library/vstudio/ys7dw0kh(v=vs.100).aspx
FORCE_INLINE __m128i _mm_setzero_si128(void)
{
    return vreinterpretq_m128i_s32(vdupq_n_s32(0));
}

// Clears the four single-precision, floating-point values.
// https://msdn.microsoft.com/en-us/library/vstudio/tk1t2tbz(v=vs.100).aspx
FORCE_INLINE __m128 _mm_setzero_ps(void)
{
    return vreinterpretq_m128_f32(vdupq_n_f32(0));
}

// Sets the four single-precision, floating-point values to w.
//
//   r0 := r1 := r2 := r3 := w
//
// https://msdn.microsoft.com/en-us/library/vstudio/2x1se8ha(v=vs.100).aspx
FORCE_INLINE __m128 _mm_set1_ps(float _w)
{
    return vreinterpretq_m128_f32(vdupq_n_f32(_w));
}

// Sets the four single-precision, floating-point values to w.
// https://msdn.microsoft.com/en-us/library/vstudio/2x1se8ha(v=vs.100).aspx
FORCE_INLINE __m128 _mm_set_ps1(float _w)
{
    return vreinterpretq_m128_f32(vdupq_n_f32(_w));
}
// Sets the four single-precision, floating-point values to the four inputs.
// https://msdn.microsoft.com/en-us/library/vstudio/afh0zf75(v=vs.100).aspx
FORCE_INLINE __m128 _mm_set_ps(float w, float z, float y, float x)
{
    float __attribute__((aligned(16))) data[4] = {x, y, z, w};
    return vreinterpretq_m128_f32(vld1q_f32(data));
}

// Sets the four single-precision, floating-point values to the four inputs in
// reverse order.
// https://msdn.microsoft.com/en-us/library/vstudio/d2172ct3(v=vs.100).aspx
FORCE_INLINE __m128 _mm_setr_ps(float w, float z, float y, float x)
{
    float __attribute__((aligned(16))) data[4] = {w, z, y, x};
    return vreinterpretq_m128_f32(vld1q_f32(data));
}
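
// Element-order sketch (illustrative): _mm_set_ps fills lanes from the top
// argument down, _mm_setr_ps from the bottom up, so these two calls build the
// same vector.
//
//   __m128 a = _mm_set_ps(4.0f, 3.0f, 2.0f, 1.0f);   // lanes [1, 2, 3, 4]
//   __m128 b = _mm_setr_ps(1.0f, 2.0f, 3.0f, 4.0f);  // lanes [1, 2, 3, 4]
//   float lo = _mm_cvtss_f32(a);                     // 1.0f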
// Sets the 8 signed 16-bit integer values in reverse order.
FORCE_INLINE __m128i _mm_setr_epi16(short w0,
                                    short w1,
                                    short w2,
                                    short w3,
                                    short w4,
                                    short w5,
                                    short w6,
                                    short w7)
{
    int16_t __attribute__((aligned(16)))
        data[8] = {w0, w1, w2, w3, w4, w5, w6, w7};
    return vreinterpretq_m128i_s16(vld1q_s16((int16_t *) data));
}
// Sets the 4 signed 32-bit integer values in reverse order
// https://technet.microsoft.com/en-us/library/security/27yb3ee5(v=vs.90).aspx
FORCE_INLINE __m128i _mm_setr_epi32(int i3, int i2, int i1, int i0)
{
    int32_t __attribute__((aligned(16))) data[4] = {i3, i2, i1, i0};
    return vreinterpretq_m128i_s32(vld1q_s32(data));
}
// Sets the 16 signed 8-bit integer values to b.
// https://msdn.microsoft.com/en-us/library/6e14xhyf(v=vs.100).aspx
FORCE_INLINE __m128i _mm_set1_epi8(signed char w)
{
    return vreinterpretq_m128i_s8(vdupq_n_s8(w));
}

// Sets the 8 signed 16-bit integer values to w.
// https://msdn.microsoft.com/en-us/library/k0ya3x0e(v=vs.90).aspx
FORCE_INLINE __m128i _mm_set1_epi16(short w)
{
    return vreinterpretq_m128i_s16(vdupq_n_s16(w));
}
// Sets the 16 signed 8-bit integer values.
// https://msdn.microsoft.com/en-us/library/x0cx8zd3(v=vs.90).aspx
FORCE_INLINE __m128i _mm_set_epi8(signed char b15,
                                  signed char b14,
                                  signed char b13,
                                  signed char b12,
                                  signed char b11,
                                  signed char b10,
                                  signed char b9,
                                  signed char b8,
                                  signed char b7,
                                  signed char b6,
                                  signed char b5,
                                  signed char b4,
                                  signed char b3,
                                  signed char b2,
                                  signed char b1,
                                  signed char b0)
{
    int8_t __attribute__((aligned(16)))
        data[16] = {(int8_t) b0, (int8_t) b1, (int8_t) b2, (int8_t) b3,
                    (int8_t) b4, (int8_t) b5, (int8_t) b6, (int8_t) b7,
                    (int8_t) b8, (int8_t) b9, (int8_t) b10, (int8_t) b11,
                    (int8_t) b12, (int8_t) b13, (int8_t) b14, (int8_t) b15};
    return (__m128i) vld1q_s8(data);
}
// Sets the 8 signed 16-bit integer values.
// https://msdn.microsoft.com/en-au/library/3e0fek84(v=vs.90).aspx
FORCE_INLINE __m128i _mm_set_epi16(short i7,
                                   short i6,
                                   short i5,
                                   short i4,
                                   short i3,
                                   short i2,
                                   short i1,
                                   short i0)
{
    int16_t __attribute__((aligned(16)))
        data[8] = {i0, i1, i2, i3, i4, i5, i6, i7};
    return vreinterpretq_m128i_s16(vld1q_s16(data));
}
// Sets the 16 signed 8-bit integer values in reverse order.
// https://msdn.microsoft.com/en-us/library/2khb9c7k(v=vs.90).aspx
FORCE_INLINE __m128i _mm_setr_epi8(signed char b0,
                                   signed char b1,
                                   signed char b2,
                                   signed char b3,
                                   signed char b4,
                                   signed char b5,
                                   signed char b6,
                                   signed char b7,
                                   signed char b8,
                                   signed char b9,
                                   signed char b10,
                                   signed char b11,
                                   signed char b12,
                                   signed char b13,
                                   signed char b14,
                                   signed char b15)
{
    int8_t __attribute__((aligned(16)))
        data[16] = {(int8_t) b0, (int8_t) b1, (int8_t) b2, (int8_t) b3,
                    (int8_t) b4, (int8_t) b5, (int8_t) b6, (int8_t) b7,
                    (int8_t) b8, (int8_t) b9, (int8_t) b10, (int8_t) b11,
                    (int8_t) b12, (int8_t) b13, (int8_t) b14, (int8_t) b15};
    return (__m128i) vld1q_s8(data);
}
// Sets the 4 signed 32-bit integer values to i.
//
//   r0 := i
//   r1 := i
//   r2 := i
//   r3 := i
//
// https://msdn.microsoft.com/en-us/library/vstudio/h4xscxat(v=vs.100).aspx
FORCE_INLINE __m128i _mm_set1_epi32(int _i)
{
    return vreinterpretq_m128i_s32(vdupq_n_s32(_i));
}

// Sets the 2 signed 64-bit integer values to i.
// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/whtfzhzk(v=vs.100)
FORCE_INLINE __m128i _mm_set1_epi64(int64_t _i)
{
    return vreinterpretq_m128i_s64(vdupq_n_s64(_i));
}

// Sets the 2 signed 64-bit integer values to i.
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_set1_epi64x&expand=4961
FORCE_INLINE __m128i _mm_set1_epi64x(int64_t _i)
{
    return vreinterpretq_m128i_s64(vdupq_n_s64(_i));
}
// Sets the 4 signed 32-bit integer values.
// https://msdn.microsoft.com/en-us/library/vstudio/019beekt(v=vs.100).aspx
FORCE_INLINE __m128i _mm_set_epi32(int i3, int i2, int i1, int i0)
{
    int32_t __attribute__((aligned(16))) data[4] = {i0, i1, i2, i3};
    return vreinterpretq_m128i_s32(vld1q_s32(data));
}

// Returns the __m128i structure with its two 64-bit integer values
// initialized to the values of the two 64-bit integers passed in.
// https://msdn.microsoft.com/en-us/library/dk2sdw0h(v=vs.120).aspx
FORCE_INLINE __m128i _mm_set_epi64x(int64_t i1, int64_t i2)
{
    int64_t __attribute__((aligned(16))) data[2] = {i2, i1};
    return vreinterpretq_m128i_s64(vld1q_s64(data));
}
// Stores four single-precision, floating-point values.
// https://msdn.microsoft.com/en-us/library/vstudio/s3h4ay6y(v=vs.100).aspx
FORCE_INLINE void _mm_store_ps(float *p, __m128 a)
{
    vst1q_f32(p, vreinterpretq_f32_m128(a));
}

// Stores four single-precision, floating-point values.
// https://msdn.microsoft.com/en-us/library/44e30x22(v=vs.100).aspx
FORCE_INLINE void _mm_storeu_ps(float *p, __m128 a)
{
    vst1q_f32(p, vreinterpretq_f32_m128(a));
}

// Stores four 32-bit integer values (as a __m128i value) at the address p.
// https://msdn.microsoft.com/en-us/library/vstudio/edk11s13(v=vs.100).aspx
FORCE_INLINE void _mm_store_si128(__m128i *p, __m128i a)
{
    vst1q_s32((int32_t *) p, vreinterpretq_s32_m128i(a));
}

// Stores four 32-bit integer values (as a __m128i value) at the address p.
// https://msdn.microsoft.com/en-us/library/vstudio/edk11s13(v=vs.100).aspx
FORCE_INLINE void _mm_storeu_si128(__m128i *p, __m128i a)
{
    vst1q_s32((int32_t *) p, vreinterpretq_s32_m128i(a));
}

// Stores the lower single-precision, floating-point value.
// https://msdn.microsoft.com/en-us/library/tzz10fbx(v=vs.100).aspx
FORCE_INLINE void _mm_store_ss(float *p, __m128 a)
{
    vst1q_lane_f32(p, vreinterpretq_f32_m128(a), 0);
}
// Reads the lower 64 bits of b and stores them into the lower 64 bits of a.
// https://msdn.microsoft.com/en-us/library/hhwf428f%28v=vs.90%29.aspx
FORCE_INLINE void _mm_storel_epi64(__m128i *a, __m128i b)
{
    uint64x1_t hi = vget_high_u64(vreinterpretq_u64_m128i(*a));
    uint64x1_t lo = vget_low_u64(vreinterpretq_u64_m128i(b));
    *a = vreinterpretq_m128i_u64(vcombine_u64(lo, hi));
}
// Stores the lower two single-precision floating point values of a to the
// address p.
// https://msdn.microsoft.com/en-us/library/h54t98ks(v=vs.90).aspx
FORCE_INLINE void _mm_storel_pi(__m64 *p, __m128 a)
{
    *p = vget_low_f32(a);
}

// Stores the upper two single-precision, floating-point values of a to the
// address p.
// https://msdn.microsoft.com/en-us/library/a7525fs8(v%3dvs.90).aspx
FORCE_INLINE void _mm_storeh_pi(__m64 *p, __m128 a)
{
    *p = vget_high_f32(a);
}
// Loads a single single-precision, floating-point value, copying it into all
// four words.
// https://msdn.microsoft.com/en-us/library/vstudio/5cdkf716(v=vs.100).aspx
FORCE_INLINE __m128 _mm_load1_ps(const float *p)
{
    return vreinterpretq_m128_f32(vld1q_dup_f32(p));
}
#define _mm_load_ps1 _mm_load1_ps
// Sets the lower two single-precision, floating-point values with 64
// bits of data loaded from the address p; the upper two values are passed
// through from a.
// https://msdn.microsoft.com/en-us/library/s57cyak2(v=vs.100).aspx
FORCE_INLINE __m128 _mm_loadl_pi(__m128 a, __m64 const *p)
{
    return vreinterpretq_m128_f32(
        vcombine_f32(vld1_f32((const float32_t *) p), vget_high_f32(a)));
}

// Sets the upper two single-precision, floating-point values with 64
// bits of data loaded from the address p; the lower two values are passed
// through from a.
// https://msdn.microsoft.com/en-us/library/w92wta0x(v%3dvs.100).aspx
FORCE_INLINE __m128 _mm_loadh_pi(__m128 a, __m64 const *p)
{
    return vreinterpretq_m128_f32(
        vcombine_f32(vget_low_f32(a), vld1_f32((const float32_t *) p)));
}
// Loads four single-precision, floating-point values.
// https://msdn.microsoft.com/en-us/library/vstudio/zzd50xxt(v=vs.100).aspx
FORCE_INLINE __m128 _mm_load_ps(const float *p)
{
    return vreinterpretq_m128_f32(vld1q_f32(p));
}

// Loads four single-precision, floating-point values.
// https://msdn.microsoft.com/en-us/library/x1b16s7z%28v=vs.90%29.aspx
FORCE_INLINE __m128 _mm_loadu_ps(const float *p)
{
    // for neon, alignment doesn't matter, so _mm_load_ps and _mm_loadu_ps are
    // equivalent for neon
    return vreinterpretq_m128_f32(vld1q_f32(p));
}

// Loads a single single-precision, floating-point value into the low word and
// clears the upper three words.
// https://msdn.microsoft.com/en-us/library/548bb9h4%28v=vs.90%29.aspx
FORCE_INLINE __m128 _mm_load_ss(const float *p)
{
    return vreinterpretq_m128_f32(vsetq_lane_f32(*p, vdupq_n_f32(0), 0));
}
FORCE_INLINE __m128i _mm_loadl_epi64(__m128i const *p)
{
    /* Load the lower 64 bits of the value pointed to by p into the
     * lower 64 bits of the result, zeroing the upper 64 bits of the result.
     */
    return vreinterpretq_m128i_s32(
        vcombine_s32(vld1_s32((int32_t const *) p), vcreate_s32(0)));
}
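
// Usage sketch (illustrative): on NEON the aligned and unaligned variants above
// lower to the same vld1q/vst1q instructions, so a round trip is simply:
//
//   float in[4] = {1.0f, 2.0f, 3.0f, 4.0f}, out[4];
//   __m128 v = _mm_loadu_ps(in);
//   _mm_storeu_ps(out, v);  // out now holds the same four values as in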
// ******************************************
// Logic/Binary operations
// ******************************************
// Compares for inequality.
// https://msdn.microsoft.com/en-us/library/sf44thbx(v=vs.100).aspx
FORCE_INLINE __m128 _mm_cmpneq_ps(__m128 a, __m128 b)
{
    return vreinterpretq_m128_u32(vmvnq_u32(
        vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))));
}
// Computes the bitwise AND-NOT of the four single-precision, floating-point
// values of a and b.
//
//   r0 := ~a0 & b0
//   r1 := ~a1 & b1
//   r2 := ~a2 & b2
//   r3 := ~a3 & b3
//
// https://msdn.microsoft.com/en-us/library/vstudio/68h7wd02(v=vs.100).aspx
FORCE_INLINE __m128 _mm_andnot_ps(__m128 a, __m128 b)
{
    return vreinterpretq_m128_s32(
        vbicq_s32(vreinterpretq_s32_m128(b),
                  vreinterpretq_s32_m128(a)));  // *NOTE* argument swap
}

// Computes the bitwise AND of the 128-bit value in b and the bitwise NOT of the
// 128-bit value in a.
// https://msdn.microsoft.com/en-us/library/vstudio/1beaceh8(v=vs.100).aspx
FORCE_INLINE __m128i _mm_andnot_si128(__m128i a, __m128i b)
{
    return vreinterpretq_m128i_s32(
        vbicq_s32(vreinterpretq_s32_m128i(b),
                  vreinterpretq_s32_m128i(a)));  // *NOTE* argument swap
}
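
// Semantics sketch (illustrative): _mm_andnot_si128(a, b) computes ~a & b,
// while vbicq_s32(x, y) computes x & ~y, which is why the arguments are
// swapped above.
//
//   __m128i a = _mm_set1_epi32(0x0000FFFF);
//   __m128i b = _mm_set1_epi32(0x00FFFF00);
//   __m128i r = _mm_andnot_si128(a, b);  // 0x00FF0000 in every lane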
// Computes the bitwise AND of the 128-bit value in a and the 128-bit value in
// b.
// https://msdn.microsoft.com/en-us/library/vstudio/6d1txsa8(v=vs.100).aspx
FORCE_INLINE __m128i _mm_and_si128(__m128i a, __m128i b)
{
    return vreinterpretq_m128i_s32(
        vandq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
}

// Computes the bitwise AND of the four single-precision, floating-point values
// of a and b.
// https://msdn.microsoft.com/en-us/library/vstudio/73ck1xc5(v=vs.100).aspx
FORCE_INLINE __m128 _mm_and_ps(__m128 a, __m128 b)
{
    return vreinterpretq_m128_s32(
        vandq_s32(vreinterpretq_s32_m128(a), vreinterpretq_s32_m128(b)));
}

// Computes the bitwise OR of the four single-precision, floating-point values
// of a and b.
// https://msdn.microsoft.com/en-us/library/vstudio/7ctdsyy0(v=vs.100).aspx
FORCE_INLINE __m128 _mm_or_ps(__m128 a, __m128 b)
{
    return vreinterpretq_m128_s32(
        vorrq_s32(vreinterpretq_s32_m128(a), vreinterpretq_s32_m128(b)));
}
// Computes bitwise EXOR (exclusive-or) of the four single-precision,
// floating-point values of a and b.
// https://msdn.microsoft.com/en-us/library/ss6k3wk8(v=vs.100).aspx
FORCE_INLINE __m128 _mm_xor_ps(__m128 a, __m128 b)
{
    return vreinterpretq_m128_s32(
        veorq_s32(vreinterpretq_s32_m128(a), vreinterpretq_s32_m128(b)));
}

// Computes the bitwise OR of the 128-bit value in a and the 128-bit value in b.
// https://msdn.microsoft.com/en-us/library/vstudio/ew8ty0db(v=vs.100).aspx
FORCE_INLINE __m128i _mm_or_si128(__m128i a, __m128i b)
{
    return vreinterpretq_m128i_s32(
        vorrq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
}

// Computes the bitwise XOR of the 128-bit value in a and the 128-bit value in
// b. https://msdn.microsoft.com/en-us/library/fzt08www(v=vs.100).aspx
FORCE_INLINE __m128i _mm_xor_si128(__m128i a, __m128i b)
{
    return vreinterpretq_m128i_s32(
        veorq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
}
// Moves the upper two values of B into the lower two values of A.
FORCE_INLINE __m128 _mm_movehl_ps(__m128 __A, __m128 __B)
{
    float32x2_t a32 = vget_high_f32(vreinterpretq_f32_m128(__A));
    float32x2_t b32 = vget_high_f32(vreinterpretq_f32_m128(__B));
    return vreinterpretq_m128_f32(vcombine_f32(b32, a32));
}

// Moves the lower two values of B into the upper two values of A.
FORCE_INLINE __m128 _mm_movelh_ps(__m128 __A, __m128 __B)
{
    float32x2_t a10 = vget_low_f32(vreinterpretq_f32_m128(__A));
    float32x2_t b10 = vget_low_f32(vreinterpretq_f32_m128(__B));
    return vreinterpretq_m128_f32(vcombine_f32(a10, b10));
}
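
// Lane sketch (illustrative) for the two moves above, with a = [a0, a1, a2, a3]
// and b = [b0, b1, b2, b3]:
//
//   _mm_movehl_ps(a, b);  // [b2, b3, a2, a3]
//   _mm_movelh_ps(a, b);  // [a0, a1, b0, b1]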
FORCE_INLINE __m128i _mm_abs_epi32(__m128i a)
{
    return vreinterpretq_m128i_s32(vabsq_s32(vreinterpretq_s32_m128i(a)));
}

FORCE_INLINE __m128i _mm_abs_epi16(__m128i a)
{
    return vreinterpretq_m128i_s16(vabsq_s16(vreinterpretq_s16_m128i(a)));
}

FORCE_INLINE __m128i _mm_abs_epi8(__m128i a)
{
    return vreinterpretq_m128i_s8(vabsq_s8(vreinterpretq_s8_m128i(a)));
}
// Takes the upper 64 bits of a and places it in the low end of the result
// Takes the lower 64 bits of b and places it into the high end of the result.
FORCE_INLINE __m128 _mm_shuffle_ps_1032(__m128 a, __m128 b)
{
    float32x2_t a32 = vget_high_f32(vreinterpretq_f32_m128(a));
    float32x2_t b10 = vget_low_f32(vreinterpretq_f32_m128(b));
    return vreinterpretq_m128_f32(vcombine_f32(a32, b10));
}

// Takes the lower two 32-bit values from a, swaps them, and places them in the
// low end of the result; takes the upper two 32-bit values from b, swaps them,
// and places them in the high end of the result.
FORCE_INLINE __m128 _mm_shuffle_ps_2301(__m128 a, __m128 b)
{
    float32x2_t a01 = vrev64_f32(vget_low_f32(vreinterpretq_f32_m128(a)));
    float32x2_t b23 = vrev64_f32(vget_high_f32(vreinterpretq_f32_m128(b)));
    return vreinterpretq_m128_f32(vcombine_f32(a01, b23));
}
FORCE_INLINE __m128 _mm_shuffle_ps_0321(__m128 a, __m128 b)
{
    float32x2_t a21 = vget_high_f32(
        vextq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a), 3));
    float32x2_t b03 = vget_low_f32(
        vextq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b), 3));
    return vreinterpretq_m128_f32(vcombine_f32(a21, b03));
}

FORCE_INLINE __m128 _mm_shuffle_ps_2103(__m128 a, __m128 b)
{
    float32x2_t a03 = vget_low_f32(
        vextq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a), 3));
    float32x2_t b21 = vget_high_f32(
        vextq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b), 3));
    return vreinterpretq_m128_f32(vcombine_f32(a03, b21));
}

FORCE_INLINE __m128 _mm_shuffle_ps_1010(__m128 a, __m128 b)
{
    float32x2_t a10 = vget_low_f32(vreinterpretq_f32_m128(a));
    float32x2_t b10 = vget_low_f32(vreinterpretq_f32_m128(b));
    return vreinterpretq_m128_f32(vcombine_f32(a10, b10));
}

FORCE_INLINE __m128 _mm_shuffle_ps_1001(__m128 a, __m128 b)
{
    float32x2_t a01 = vrev64_f32(vget_low_f32(vreinterpretq_f32_m128(a)));
    float32x2_t b10 = vget_low_f32(vreinterpretq_f32_m128(b));
    return vreinterpretq_m128_f32(vcombine_f32(a01, b10));
}

FORCE_INLINE __m128 _mm_shuffle_ps_0101(__m128 a, __m128 b)
{
    float32x2_t a01 = vrev64_f32(vget_low_f32(vreinterpretq_f32_m128(a)));
    float32x2_t b01 = vrev64_f32(vget_low_f32(vreinterpretq_f32_m128(b)));
    return vreinterpretq_m128_f32(vcombine_f32(a01, b01));
}
// keeps the low 64 bits of a in the low end and puts the high 64 bits of b in
// the high end of the result.
FORCE_INLINE __m128 _mm_shuffle_ps_3210(__m128 a, __m128 b)
{
    float32x2_t a10 = vget_low_f32(vreinterpretq_f32_m128(a));
    float32x2_t b32 = vget_high_f32(vreinterpretq_f32_m128(b));
    return vreinterpretq_m128_f32(vcombine_f32(a10, b32));
}
FORCE_INLINE __m128 _mm_shuffle_ps_0011(__m128 a, __m128 b)
{
    float32x2_t a11 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(a)), 1);
    float32x2_t b00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 0);
    return vreinterpretq_m128_f32(vcombine_f32(a11, b00));
}

FORCE_INLINE __m128 _mm_shuffle_ps_0022(__m128 a, __m128 b)
{
    float32x2_t a22 =
        vdup_lane_f32(vget_high_f32(vreinterpretq_f32_m128(a)), 0);
    float32x2_t b00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 0);
    return vreinterpretq_m128_f32(vcombine_f32(a22, b00));
}

FORCE_INLINE __m128 _mm_shuffle_ps_2200(__m128 a, __m128 b)
{
    float32x2_t a00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(a)), 0);
    float32x2_t b22 =
        vdup_lane_f32(vget_high_f32(vreinterpretq_f32_m128(b)), 0);
    return vreinterpretq_m128_f32(vcombine_f32(a00, b22));
}

FORCE_INLINE __m128 _mm_shuffle_ps_3202(__m128 a, __m128 b)
{
    float32_t a0 = vgetq_lane_f32(vreinterpretq_f32_m128(a), 0);
    float32x2_t a22 =
        vdup_lane_f32(vget_high_f32(vreinterpretq_f32_m128(a)), 0);
    float32x2_t a02 = vset_lane_f32(a0, a22, 1); /* TODO: use vzip ?*/
    float32x2_t b32 = vget_high_f32(vreinterpretq_f32_m128(b));
    return vreinterpretq_m128_f32(vcombine_f32(a02, b32));
}

FORCE_INLINE __m128 _mm_shuffle_ps_1133(__m128 a, __m128 b)
{
    float32x2_t a33 =
        vdup_lane_f32(vget_high_f32(vreinterpretq_f32_m128(a)), 1);
    float32x2_t b11 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 1);
    return vreinterpretq_m128_f32(vcombine_f32(a33, b11));
}

FORCE_INLINE __m128 _mm_shuffle_ps_2010(__m128 a, __m128 b)
{
    float32x2_t a10 = vget_low_f32(vreinterpretq_f32_m128(a));
    float32_t b2 = vgetq_lane_f32(vreinterpretq_f32_m128(b), 2);
    float32x2_t b00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 0);
    float32x2_t b20 = vset_lane_f32(b2, b00, 1);
    return vreinterpretq_m128_f32(vcombine_f32(a10, b20));
}

FORCE_INLINE __m128 _mm_shuffle_ps_2001(__m128 a, __m128 b)
{
    float32x2_t a01 = vrev64_f32(vget_low_f32(vreinterpretq_f32_m128(a)));
    float32_t b2 = vgetq_lane_f32(vreinterpretq_f32_m128(b), 2);
    float32x2_t b00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 0);
    float32x2_t b20 = vset_lane_f32(b2, b00, 1);
    return vreinterpretq_m128_f32(vcombine_f32(a01, b20));
}

FORCE_INLINE __m128 _mm_shuffle_ps_2032(__m128 a, __m128 b)
{
    float32x2_t a32 = vget_high_f32(vreinterpretq_f32_m128(a));
    float32_t b2 = vgetq_lane_f32(vreinterpretq_f32_m128(b), 2);
    float32x2_t b00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 0);
    float32x2_t b20 = vset_lane_f32(b2, b00, 1);
    return vreinterpretq_m128_f32(vcombine_f32(a32, b20));
}
// NEON does not support a general purpose permute intrinsic
// Selects four specific single-precision, floating-point values from a and b,
// based on the mask i.
// https://msdn.microsoft.com/en-us/library/vstudio/5f0858x0(v=vs.100).aspx
#if 0 /* C version */
FORCE_INLINE __m128 _mm_shuffle_ps_default(__m128 a,
                                           __m128 b,
                                           __constrange(0, 255) int imm)
{
    __m128 ret;
    ret[0] = a[imm & 0x3];
    ret[1] = a[(imm >> 2) & 0x3];
    ret[2] = b[(imm >> 4) & 0x03];
    ret[3] = b[(imm >> 6) & 0x03];
    return ret;
}
#endif
#define _mm_shuffle_ps_default(a, b, imm)                                  \
    __extension__({                                                        \
        float32x4_t ret;                                                   \
        ret = vmovq_n_f32(                                                 \
            vgetq_lane_f32(vreinterpretq_f32_m128(a), (imm) & 0x3));       \
        ret = vsetq_lane_f32(                                              \
            vgetq_lane_f32(vreinterpretq_f32_m128(a), ((imm) >> 2) & 0x3), \
            ret, 1);                                                       \
        ret = vsetq_lane_f32(                                              \
            vgetq_lane_f32(vreinterpretq_f32_m128(b), ((imm) >> 4) & 0x3), \
            ret, 2);                                                       \
        ret = vsetq_lane_f32(                                              \
            vgetq_lane_f32(vreinterpretq_f32_m128(b), ((imm) >> 6) & 0x3), \
            ret, 3);                                                       \
        vreinterpretq_m128_f32(ret);                                       \
    })
// FORCE_INLINE __m128 _mm_shuffle_ps(__m128 a, __m128 b, __constrange(0,255)
// int imm)
#if defined(__clang__)
#define _mm_shuffle_ps(a, b, imm)                              \
    __extension__({                                            \
        float32x4_t _input1 = vreinterpretq_f32_m128(a);       \
        float32x4_t _input2 = vreinterpretq_f32_m128(b);       \
        float32x4_t _shuf =                                    \
            __builtin_shufflevector(_input1, _input2,          \
                                    (imm) & 0x3,               \
                                    ((imm) >> 2) & 0x3,        \
                                    (((imm) >> 4) & 0x3) + 4,  \
                                    (((imm) >> 6) & 0x3) + 4); \
        vreinterpretq_m128_f32(_shuf);                         \
    })
#else  // generic
#define _mm_shuffle_ps(a, b, imm)                          \
    __extension__({                                        \
        __m128 ret;                                        \
        switch (imm) {                                     \
        case _MM_SHUFFLE(1, 0, 3, 2):                      \
            ret = _mm_shuffle_ps_1032((a), (b));           \
            break;                                         \
        case _MM_SHUFFLE(2, 3, 0, 1):                      \
            ret = _mm_shuffle_ps_2301((a), (b));           \
            break;                                         \
        case _MM_SHUFFLE(0, 3, 2, 1):                      \
            ret = _mm_shuffle_ps_0321((a), (b));           \
            break;                                         \
        case _MM_SHUFFLE(2, 1, 0, 3):                      \
            ret = _mm_shuffle_ps_2103((a), (b));           \
            break;                                         \
        case _MM_SHUFFLE(1, 0, 1, 0):                      \
            ret = _mm_movelh_ps((a), (b));                 \
            break;                                         \
        case _MM_SHUFFLE(1, 0, 0, 1):                      \
            ret = _mm_shuffle_ps_1001((a), (b));           \
            break;                                         \
        case _MM_SHUFFLE(0, 1, 0, 1):                      \
            ret = _mm_shuffle_ps_0101((a), (b));           \
            break;                                         \
        case _MM_SHUFFLE(3, 2, 1, 0):                      \
            ret = _mm_shuffle_ps_3210((a), (b));           \
            break;                                         \
        case _MM_SHUFFLE(0, 0, 1, 1):                      \
            ret = _mm_shuffle_ps_0011((a), (b));           \
            break;                                         \
        case _MM_SHUFFLE(0, 0, 2, 2):                      \
            ret = _mm_shuffle_ps_0022((a), (b));           \
            break;                                         \
        case _MM_SHUFFLE(2, 2, 0, 0):                      \
            ret = _mm_shuffle_ps_2200((a), (b));           \
            break;                                         \
        case _MM_SHUFFLE(3, 2, 0, 2):                      \
            ret = _mm_shuffle_ps_3202((a), (b));           \
            break;                                         \
        case _MM_SHUFFLE(3, 2, 3, 2):                      \
            ret = _mm_movehl_ps((b), (a));                 \
            break;                                         \
        case _MM_SHUFFLE(1, 1, 3, 3):                      \
            ret = _mm_shuffle_ps_1133((a), (b));           \
            break;                                         \
        case _MM_SHUFFLE(2, 0, 1, 0):                      \
            ret = _mm_shuffle_ps_2010((a), (b));           \
            break;                                         \
        case _MM_SHUFFLE(2, 0, 0, 1):                      \
            ret = _mm_shuffle_ps_2001((a), (b));           \
            break;                                         \
        case _MM_SHUFFLE(2, 0, 3, 2):                      \
            ret = _mm_shuffle_ps_2032((a), (b));           \
            break;                                         \
        default:                                           \
            ret = _mm_shuffle_ps_default((a), (b), (imm)); \
            break;                                         \
        }                                                  \
        ret;                                               \
    })
#endif
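
// Usage sketch (illustrative): broadcasting one lane with the generic path
// above (0xAA is not one of the special-cased selectors, so it falls through
// to _mm_shuffle_ps_default).
//
//   __m128 a = _mm_setr_ps(1.0f, 2.0f, 3.0f, 4.0f);
//   __m128 r = _mm_shuffle_ps(a, a, _MM_SHUFFLE(2, 2, 2, 2));  // [3, 3, 3, 3]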
// Takes the upper 64 bits of a and places it in the low end of the result
// Takes the lower 64 bits of a and places it into the high end of the result.
FORCE_INLINE __m128i _mm_shuffle_epi_1032(__m128i a)
{
    int32x2_t a32 = vget_high_s32(vreinterpretq_s32_m128i(a));
    int32x2_t a10 = vget_low_s32(vreinterpretq_s32_m128i(a));
    return vreinterpretq_m128i_s32(vcombine_s32(a32, a10));
}

// takes the lower two 32-bit values from a and swaps them and places them in
// the low end of the result; takes the higher two 32-bit values from a and
// swaps them and places them in the high end of the result.
FORCE_INLINE __m128i _mm_shuffle_epi_2301(__m128i a)
{
    int32x2_t a01 = vrev64_s32(vget_low_s32(vreinterpretq_s32_m128i(a)));
    int32x2_t a23 = vrev64_s32(vget_high_s32(vreinterpretq_s32_m128i(a)));
    return vreinterpretq_m128i_s32(vcombine_s32(a01, a23));
}

// rotates the least significant 32 bits into the most significant 32 bits, and
// shifts the rest down
FORCE_INLINE __m128i _mm_shuffle_epi_0321(__m128i a)
{
    return vreinterpretq_m128i_s32(
        vextq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(a), 1));
}

// rotates the most significant 32 bits into the least significant 32 bits, and
// shifts the rest up
FORCE_INLINE __m128i _mm_shuffle_epi_2103(__m128i a)
{
    return vreinterpretq_m128i_s32(
        vextq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(a), 3));
}
// gets the lower 64 bits of a, and places it in the upper 64 bits
// gets the lower 64 bits of a and places it in the lower 64 bits
FORCE_INLINE __m128i _mm_shuffle_epi_1010(__m128i a)
{
    int32x2_t a10 = vget_low_s32(vreinterpretq_s32_m128i(a));
    return vreinterpretq_m128i_s32(vcombine_s32(a10, a10));
}

// gets the lower 64 bits of a, swaps the 0 and 1 elements, and places it in the
// lower 64 bits; gets the lower 64 bits of a, and places it in the upper 64
// bits
FORCE_INLINE __m128i _mm_shuffle_epi_1001(__m128i a)
{
    int32x2_t a01 = vrev64_s32(vget_low_s32(vreinterpretq_s32_m128i(a)));
    int32x2_t a10 = vget_low_s32(vreinterpretq_s32_m128i(a));
    return vreinterpretq_m128i_s32(vcombine_s32(a01, a10));
}

// gets the lower 64 bits of a, swaps the 0 and 1 elements and places it in the
// upper 64 bits; gets the lower 64 bits of a, swaps the 0 and 1 elements, and
// places it in the lower 64 bits
FORCE_INLINE __m128i _mm_shuffle_epi_0101(__m128i a)
{
    int32x2_t a01 = vrev64_s32(vget_low_s32(vreinterpretq_s32_m128i(a)));
    return vreinterpretq_m128i_s32(vcombine_s32(a01, a01));
}

FORCE_INLINE __m128i _mm_shuffle_epi_2211(__m128i a)
{
    int32x2_t a11 = vdup_lane_s32(vget_low_s32(vreinterpretq_s32_m128i(a)), 1);
    int32x2_t a22 = vdup_lane_s32(vget_high_s32(vreinterpretq_s32_m128i(a)), 0);
    return vreinterpretq_m128i_s32(vcombine_s32(a11, a22));
}

FORCE_INLINE __m128i _mm_shuffle_epi_0122(__m128i a)
{
    int32x2_t a22 = vdup_lane_s32(vget_high_s32(vreinterpretq_s32_m128i(a)), 0);
    int32x2_t a01 = vrev64_s32(vget_low_s32(vreinterpretq_s32_m128i(a)));
    return vreinterpretq_m128i_s32(vcombine_s32(a22, a01));
}

FORCE_INLINE __m128i _mm_shuffle_epi_3332(__m128i a)
{
    int32x2_t a32 = vget_high_s32(vreinterpretq_s32_m128i(a));
    int32x2_t a33 = vdup_lane_s32(vget_high_s32(vreinterpretq_s32_m128i(a)), 1);
    return vreinterpretq_m128i_s32(vcombine_s32(a32, a33));
}
// Shuffle packed 8-bit integers in a according to shuffle control mask in the
// corresponding 8-bit element of b, and store the results in dst.
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_shuffle_epi8&expand=5146
FORCE_INLINE __m128i _mm_shuffle_epi8(__m128i a, __m128i b)
{
    int8x16_t tbl = vreinterpretq_s8_m128i(a);   // input a
    uint8x16_t idx = vreinterpretq_u8_m128i(b);  // input b
    uint8x16_t idx_masked =
        vandq_u8(idx, vdupq_n_u8(0x8F));  // avoid using meaningless bits
#if defined(__aarch64__)
    return vreinterpretq_m128i_s8(vqtbl1q_s8(tbl, idx_masked));
#elif defined(__GNUC__)
    int8x16_t ret;
    // %e and %f represent the even and odd D registers
    // of the corresponding Q register.
    __asm__ __volatile__(
        " vtbl.8 %e[ret], {%e[tbl], %f[tbl]}, %e[idx]\n"
        " vtbl.8 %f[ret], {%e[tbl], %f[tbl]}, %f[idx]\n"
        : [ret] "=&w"(ret)
        : [tbl] "w"(tbl), [idx] "w"(idx_masked));
    return vreinterpretq_m128i_s8(ret);
#else
    // use this line if testing on aarch64
    int8x8x2_t a_split = {vget_low_s8(tbl), vget_high_s8(tbl)};
    return vreinterpretq_m128i_s8(
        vcombine_s8(vtbl2_s8(a_split, vget_low_u8(idx_masked)),
                    vtbl2_s8(a_split, vget_high_u8(idx_masked))));
#endif
}
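
// Usage sketch (illustrative): reversing the bytes of a vector with a constant
// index table.
//
//   const int8_t rev[16] = {15, 14, 13, 12, 11, 10, 9, 8,
//                           7, 6, 5, 4, 3, 2, 1, 0};
//   __m128i idx = vreinterpretq_m128i_s8(vld1q_s8(rev));
//   __m128i r = _mm_shuffle_epi8(a, idx);  // byte i of r = byte (15 - i) of a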
#if 0 /* C version */
FORCE_INLINE __m128i _mm_shuffle_epi32_default(__m128i a,
                                               __constrange(0, 255) int imm)
{
    __m128i ret;
    ret[0] = a[imm & 0x3];
    ret[1] = a[(imm >> 2) & 0x3];
    ret[2] = a[(imm >> 4) & 0x03];
    ret[3] = a[(imm >> 6) & 0x03];
    return ret;
}
#endif
#define _mm_shuffle_epi32_default(a, imm)                                   \
    __extension__({                                                         \
        int32x4_t ret;                                                      \
        ret = vmovq_n_s32(                                                  \
            vgetq_lane_s32(vreinterpretq_s32_m128i(a), (imm) & 0x3));       \
        ret = vsetq_lane_s32(                                               \
            vgetq_lane_s32(vreinterpretq_s32_m128i(a), ((imm) >> 2) & 0x3), \
            ret, 1);                                                        \
        ret = vsetq_lane_s32(                                               \
            vgetq_lane_s32(vreinterpretq_s32_m128i(a), ((imm) >> 4) & 0x3), \
            ret, 2);                                                        \
        ret = vsetq_lane_s32(                                               \
            vgetq_lane_s32(vreinterpretq_s32_m128i(a), ((imm) >> 6) & 0x3), \
            ret, 3);                                                        \
        vreinterpretq_m128i_s32(ret);                                       \
    })
// FORCE_INLINE __m128i _mm_shuffle_epi32_splat(__m128i a, __constrange(0,255)
// int imm)
#if defined(__aarch64__)
#define _mm_shuffle_epi32_splat(a, imm)                          \
    __extension__({                                              \
        vreinterpretq_m128i_s32(                                 \
            vdupq_laneq_s32(vreinterpretq_s32_m128i(a), (imm))); \
    })
#else
#define _mm_shuffle_epi32_splat(a, imm)                                      \
    __extension__({                                                          \
        vreinterpretq_m128i_s32(                                             \
            vdupq_n_s32(vgetq_lane_s32(vreinterpretq_s32_m128i(a), (imm)))); \
    })
#endif
// Shuffles the 4 signed or unsigned 32-bit integers in a as specified by imm.
// https://msdn.microsoft.com/en-us/library/56f67xbk%28v=vs.90%29.aspx
// FORCE_INLINE __m128i _mm_shuffle_epi32(__m128i a, __constrange(0,255) int
// imm)
#if defined(__clang__)
#define _mm_shuffle_epi32(a, imm)                                            \
    __extension__({                                                          \
        int32x4_t _input = vreinterpretq_s32_m128i(a);                       \
        int32x4_t _shuf =                                                    \
            __builtin_shufflevector(_input, _input,                          \
                                    (imm) & 0x3, ((imm) >> 2) & 0x3,         \
                                    ((imm) >> 4) & 0x3, ((imm) >> 6) & 0x3); \
        vreinterpretq_m128i_s32(_shuf);                                      \
    })
#else  // generic
#define _mm_shuffle_epi32(a, imm)                        \
    __extension__({                                      \
        __m128i ret;                                     \
        switch (imm) {                                   \
        case _MM_SHUFFLE(1, 0, 3, 2):                    \
            ret = _mm_shuffle_epi_1032((a));             \
            break;                                       \
        case _MM_SHUFFLE(2, 3, 0, 1):                    \
            ret = _mm_shuffle_epi_2301((a));             \
            break;                                       \
        case _MM_SHUFFLE(0, 3, 2, 1):                    \
            ret = _mm_shuffle_epi_0321((a));             \
            break;                                       \
        case _MM_SHUFFLE(2, 1, 0, 3):                    \
            ret = _mm_shuffle_epi_2103((a));             \
            break;                                       \
        case _MM_SHUFFLE(1, 0, 1, 0):                    \
            ret = _mm_shuffle_epi_1010((a));             \
            break;                                       \
        case _MM_SHUFFLE(1, 0, 0, 1):                    \
            ret = _mm_shuffle_epi_1001((a));             \
            break;                                       \
        case _MM_SHUFFLE(0, 1, 0, 1):                    \
            ret = _mm_shuffle_epi_0101((a));             \
            break;                                       \
        case _MM_SHUFFLE(2, 2, 1, 1):                    \
            ret = _mm_shuffle_epi_2211((a));             \
            break;                                       \
        case _MM_SHUFFLE(0, 1, 2, 2):                    \
            ret = _mm_shuffle_epi_0122((a));             \
            break;                                       \
        case _MM_SHUFFLE(3, 3, 3, 2):                    \
            ret = _mm_shuffle_epi_3332((a));             \
            break;                                       \
        case _MM_SHUFFLE(0, 0, 0, 0):                    \
            ret = _mm_shuffle_epi32_splat((a), 0);       \
            break;                                       \
        case _MM_SHUFFLE(1, 1, 1, 1):                    \
            ret = _mm_shuffle_epi32_splat((a), 1);       \
            break;                                       \
        case _MM_SHUFFLE(2, 2, 2, 2):                    \
            ret = _mm_shuffle_epi32_splat((a), 2);       \
            break;                                       \
        case _MM_SHUFFLE(3, 3, 3, 3):                    \
            ret = _mm_shuffle_epi32_splat((a), 3);       \
            break;                                       \
        default:                                         \
            ret = _mm_shuffle_epi32_default((a), (imm)); \
            break;                                       \
        }                                                \
        ret;                                             \
    })
#endif
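
// Usage sketch (illustrative): reversing the 32-bit lanes of a vector.
//
//   __m128i v = _mm_setr_epi32(10, 20, 30, 40);
//   __m128i r = _mm_shuffle_epi32(v, _MM_SHUFFLE(0, 1, 2, 3));
//   // r = [40, 30, 20, 10]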
// Shuffles the lower 4 signed or unsigned 16-bit integers in a as specified
// by imm.
// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/y41dkk37(v=vs.100)
// FORCE_INLINE __m128i _mm_shufflelo_epi16_function(__m128i a,
// __constrange(0,255) int imm)
#define _mm_shufflelo_epi16_function(a, imm)                                  \
    __extension__({                                                           \
        int16x8_t ret = vreinterpretq_s16_m128i(a);                           \
        int16x4_t lowBits = vget_low_s16(ret);                                \
        ret = vsetq_lane_s16(vget_lane_s16(lowBits, (imm) & 0x3), ret, 0);    \
        ret = vsetq_lane_s16(vget_lane_s16(lowBits, ((imm) >> 2) & 0x3), ret, \
                             1);                                              \
        ret = vsetq_lane_s16(vget_lane_s16(lowBits, ((imm) >> 4) & 0x3), ret, \
                             2);                                              \
        ret = vsetq_lane_s16(vget_lane_s16(lowBits, ((imm) >> 6) & 0x3), ret, \
                             3);                                              \
        vreinterpretq_m128i_s16(ret);                                         \
    })

// FORCE_INLINE __m128i _mm_shufflelo_epi16(__m128i a, __constrange(0,255) int
// imm)
#if defined(__clang__)
#define _mm_shufflelo_epi16(a, imm)                       \
    __extension__({                                       \
        int16x8_t _input = vreinterpretq_s16_m128i(a);    \
        int16x8_t _shuf =                                 \
            __builtin_shufflevector(_input, _input,       \
                                    ((imm) & 0x3),        \
                                    (((imm) >> 2) & 0x3), \
                                    (((imm) >> 4) & 0x3), \
                                    (((imm) >> 6) & 0x3), \
                                    4, 5, 6, 7);          \
        vreinterpretq_m128i_s16(_shuf);                   \
    })
#else  // generic
#define _mm_shufflelo_epi16(a, imm) _mm_shufflelo_epi16_function((a), (imm))
#endif
// Shuffles the upper 4 signed or unsigned 16-bit integers in a as specified
// by imm.
// https://msdn.microsoft.com/en-us/library/13ywktbs(v=vs.100).aspx
// FORCE_INLINE __m128i _mm_shufflehi_epi16_function(__m128i a,
// __constrange(0,255) int imm)
#define _mm_shufflehi_epi16_function(a, imm)                                   \
    __extension__({                                                            \
        int16x8_t ret = vreinterpretq_s16_m128i(a);                            \
        int16x4_t highBits = vget_high_s16(ret);                               \
        ret = vsetq_lane_s16(vget_lane_s16(highBits, (imm) & 0x3), ret, 4);    \
        ret = vsetq_lane_s16(vget_lane_s16(highBits, ((imm) >> 2) & 0x3), ret, \
                             5);                                               \
        ret = vsetq_lane_s16(vget_lane_s16(highBits, ((imm) >> 4) & 0x3), ret, \
                             6);                                               \
        ret = vsetq_lane_s16(vget_lane_s16(highBits, ((imm) >> 6) & 0x3), ret, \
                             7);                                               \
        vreinterpretq_m128i_s16(ret);                                          \
    })

// FORCE_INLINE __m128i _mm_shufflehi_epi16(__m128i a, __constrange(0,255) int
// imm)
#if defined(__clang__)
#define _mm_shufflehi_epi16(a, imm)                            \
    __extension__({                                            \
        int16x8_t _input = vreinterpretq_s16_m128i(a);         \
        int16x8_t _shuf =                                      \
            __builtin_shufflevector(_input, _input,            \
                                    0, 1, 2, 3,                \
                                    ((imm) & 0x3) + 4,         \
                                    (((imm) >> 2) & 0x3) + 4,  \
                                    (((imm) >> 4) & 0x3) + 4,  \
                                    (((imm) >> 6) & 0x3) + 4); \
        vreinterpretq_m128i_s16(_shuf);                        \
    })
#else  // generic
#define _mm_shufflehi_epi16(a, imm) _mm_shufflehi_epi16_function((a), (imm))
#endif
// Blend packed 16-bit integers from a and b using control mask imm8, and store
// the results in dst.
//
//   FOR j := 0 to 7
//       i := j*16
//       IF imm8[j]
//           dst[i+15:i] := b[i+15:i]
//       ELSE
//           dst[i+15:i] := a[i+15:i]
//       FI
//   ENDFOR
// FORCE_INLINE __m128i _mm_blend_epi16(__m128i a, __m128i b,
// __constrange(0,255) int imm)
#define _mm_blend_epi16(a, b, imm)                                        \
    __extension__({                                                       \
        const uint16_t _mask[8] = {((imm) & (1 << 0)) ? 0xFFFF : 0x0000,  \
                                   ((imm) & (1 << 1)) ? 0xFFFF : 0x0000,  \
                                   ((imm) & (1 << 2)) ? 0xFFFF : 0x0000,  \
                                   ((imm) & (1 << 3)) ? 0xFFFF : 0x0000,  \
                                   ((imm) & (1 << 4)) ? 0xFFFF : 0x0000,  \
                                   ((imm) & (1 << 5)) ? 0xFFFF : 0x0000,  \
                                   ((imm) & (1 << 6)) ? 0xFFFF : 0x0000,  \
                                   ((imm) & (1 << 7)) ? 0xFFFF : 0x0000}; \
        uint16x8_t _mask_vec = vld1q_u16(_mask);                          \
        uint16x8_t _a = vreinterpretq_u16_m128i(a);                       \
        uint16x8_t _b = vreinterpretq_u16_m128i(b);                       \
        vreinterpretq_m128i_u16(vbslq_u16(_mask_vec, _b, _a));            \
    })
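
// Usage sketch (illustrative): bit j of imm selects lane j from b, so 0x0F
// takes the low four lanes from b and the high four from a.
//
//   __m128i r = _mm_blend_epi16(a, b, 0x0F);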
// Blend packed 8-bit integers from a and b using mask, and store the results in
// dst.
//
//   FOR j := 0 to 15
//       i := j*8
//       IF mask[i+7]
//           dst[i+7:i] := b[i+7:i]
//       ELSE
//           dst[i+7:i] := a[i+7:i]
//       FI
//   ENDFOR
FORCE_INLINE __m128i _mm_blendv_epi8(__m128i _a, __m128i _b, __m128i _mask)
{
    // Use a signed shift right to create a mask with the sign bit
    uint8x16_t mask =
        vreinterpretq_u8_s8(vshrq_n_s8(vreinterpretq_s8_m128i(_mask), 7));
    uint8x16_t a = vreinterpretq_u8_m128i(_a);
    uint8x16_t b = vreinterpretq_u8_m128i(_b);
    return vreinterpretq_m128i_u8(vbslq_u8(mask, b, a));
}
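
// Usage sketch (illustrative): lanes whose mask byte has the sign bit set take
// their value from b, all others from a.
//
//   __m128i m = _mm_set1_epi8(-1);         // sign bit set in every lane
//   __m128i r = _mm_blendv_epi8(a, b, m);  // r == b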
/////////////////////////////////////
// Shifts
/////////////////////////////////////
// Shifts the 4 signed 32-bit integers in a right by count bits while shifting
// in the sign bit.
//
//   r0 := a0 >> count
//   r1 := a1 >> count
//   r2 := a2 >> count
//   r3 := a3 >> count
FORCE_INLINE __m128i _mm_srai_epi32(__m128i a, int count)
{
    return (__m128i) vshlq_s32((int32x4_t) a, vdupq_n_s32(-count));
}

// Shifts the 8 signed 16-bit integers in a right by count bits while shifting
// in the sign bit.
//
//   r0 := a0 >> count
//   r1 := a1 >> count
//   ...
//   r7 := a7 >> count
FORCE_INLINE __m128i _mm_srai_epi16(__m128i a, int count)
{
    return (__m128i) vshlq_s16((int16x8_t) a, vdupq_n_s16(-count));
}
// Shifts the 8 signed or unsigned 16-bit integers in a left by count bits while
// shifting in zeros.
//
//   r0 := a0 << count
//   r1 := a1 << count
//   ...
//   r7 := a7 << count
//
// https://msdn.microsoft.com/en-us/library/es73bcsy(v=vs.90).aspx
// FORCE_INLINE __m128i _mm_slli_epi16(__m128i a, __constrange(0,255) int imm)
#define _mm_slli_epi16(a, imm)                                   \
    __extension__({                                              \
        __m128i ret;                                             \
        if ((imm) <= 0) {                                        \
            ret = a;                                             \
        } else if ((imm) > 15) {                                 \
            ret = _mm_setzero_si128();                           \
        } else {                                                 \
            ret = vreinterpretq_m128i_s16(                       \
                vshlq_n_s16(vreinterpretq_s16_m128i(a), (imm))); \
        }                                                        \
        ret;                                                     \
    })

// Shifts the 4 signed or unsigned 32-bit integers in a left by count bits while
// shifting in zeros.
// https://msdn.microsoft.com/en-us/library/z2k3bbtb%28v=vs.90%29.aspx
// FORCE_INLINE __m128i _mm_slli_epi32(__m128i a, __constrange(0,255) int imm)
#define _mm_slli_epi32(a, imm)                                   \
    __extension__({                                              \
        __m128i ret;                                             \
        if ((imm) <= 0) {                                        \
            ret = a;                                             \
        } else if ((imm) > 31) {                                 \
            ret = _mm_setzero_si128();                           \
        } else {                                                 \
            ret = vreinterpretq_m128i_s32(                       \
                vshlq_n_s32(vreinterpretq_s32_m128i(a), (imm))); \
        }                                                        \
        ret;                                                     \
    })

// Shift packed 64-bit integers in a left by imm8 while shifting in zeros, and
// store the results in dst.
#define _mm_slli_epi64(a, imm)                                   \
    __extension__({                                              \
        __m128i ret;                                             \
        if ((imm) <= 0) {                                        \
            ret = a;                                             \
        } else if ((imm) > 63) {                                 \
            ret = _mm_setzero_si128();                           \
        } else {                                                 \
            ret = vreinterpretq_m128i_s64(                       \
                vshlq_n_s64(vreinterpretq_s64_m128i(a), (imm))); \
        }                                                        \
        ret;                                                     \
    })
// Shifts the 8 signed or unsigned 16-bit integers in a right by count bits
// while shifting in zeros.
//
//   r0 := srl(a0, count)
//   r1 := srl(a1, count)
//   ...
//   r7 := srl(a7, count)
//
// https://msdn.microsoft.com/en-us/library/6tcwd38t(v=vs.90).aspx
// FORCE_INLINE __m128i _mm_srli_epi16(__m128i a, __constrange(0,255) int imm)
#define _mm_srli_epi16(a, imm)                                   \
    __extension__({                                              \
        __m128i ret;                                             \
        if ((imm) <= 0) {                                        \
            ret = a;                                             \
        } else if ((imm) > 15) {                                 \
            ret = _mm_setzero_si128();                           \
        } else {                                                 \
            ret = vreinterpretq_m128i_u16(                       \
                vshrq_n_u16(vreinterpretq_u16_m128i(a), (imm))); \
        }                                                        \
        ret;                                                     \
    })

// Shifts the 4 signed or unsigned 32-bit integers in a right by count bits
// while shifting in zeros.
// https://msdn.microsoft.com/en-us/library/w486zcfa(v=vs.100).aspx
// FORCE_INLINE __m128i _mm_srli_epi32(__m128i a, __constrange(0,255) int imm)
#define _mm_srli_epi32(a, imm)                                   \
    __extension__({                                              \
        __m128i ret;                                             \
        if ((imm) <= 0) {                                        \
            ret = a;                                             \
        } else if ((imm) > 31) {                                 \
            ret = _mm_setzero_si128();                           \
        } else {                                                 \
            ret = vreinterpretq_m128i_u32(                       \
                vshrq_n_u32(vreinterpretq_u32_m128i(a), (imm))); \
        }                                                        \
        ret;                                                     \
    })

// Shift packed 64-bit integers in a right by imm8 while shifting in zeros, and
// store the results in dst.
#define _mm_srli_epi64(a, imm)                                   \
    __extension__({                                              \
        __m128i ret;                                             \
        if ((imm) <= 0) {                                        \
            ret = a;                                             \
        } else if ((imm) > 63) {                                 \
            ret = _mm_setzero_si128();                           \
        } else {                                                 \
            ret = vreinterpretq_m128i_u64(                       \
                vshrq_n_u64(vreinterpretq_u64_m128i(a), (imm))); \
        }                                                        \
        ret;                                                     \
    })
// Shifts the 4 signed 32-bit integers in a right by count bits while shifting
// in the sign bit.
// https://msdn.microsoft.com/en-us/library/z1939387(v=vs.100).aspx
// FORCE_INLINE __m128i _mm_srai_epi32(__m128i a, __constrange(0,255) int imm)
#define _mm_srai_epi32(a, imm)                                   \
    __extension__({                                              \
        __m128i ret;                                             \
        if ((imm) <= 0) {                                        \
            ret = a;                                             \
        } else if ((imm) > 31) {                                 \
            ret = vreinterpretq_m128i_s32(                       \
                vshrq_n_s32(vreinterpretq_s32_m128i(a), 16));    \
            ret = vreinterpretq_m128i_s32(                       \
                vshrq_n_s32(vreinterpretq_s32_m128i(ret), 16));  \
        } else {                                                 \
            ret = vreinterpretq_m128i_s32(                       \
                vshrq_n_s32(vreinterpretq_s32_m128i(a), (imm))); \
        }                                                        \
        ret;                                                     \
    })
// Shifts the 128-bit value in a right by imm bytes while shifting in zeros.
// imm must be an immediate.
//
//   r := srl(a, imm*8)
//
// https://msdn.microsoft.com/en-us/library/305w28yz(v=vs.100).aspx
// FORCE_INLINE _mm_srli_si128(__m128i a, __constrange(0,255) int imm)
#define _mm_srli_si128(a, imm)                                              \
    __extension__({                                                         \
        __m128i ret;                                                        \
        if ((imm) <= 0) {                                                   \
            ret = a;                                                        \
        } else if ((imm) > 15) {                                            \
            ret = _mm_setzero_si128();                                      \
        } else {                                                            \
            ret = vreinterpretq_m128i_s8(                                   \
                vextq_s8(vreinterpretq_s8_m128i(a), vdupq_n_s8(0), (imm))); \
        }                                                                   \
        ret;                                                                \
    })

// Shifts the 128-bit value in a left by imm bytes while shifting in zeros. imm
// must be an immediate.
//
//   r := a << (imm * 8)
//
// https://msdn.microsoft.com/en-us/library/34d3k2kt(v=vs.100).aspx
// FORCE_INLINE __m128i _mm_slli_si128(__m128i a, __constrange(0,255) int imm)
#define _mm_slli_si128(a, imm)                                          \
    __extension__({                                                     \
        __m128i ret;                                                    \
        if ((imm) <= 0) {                                               \
            ret = a;                                                    \
        } else if ((imm) > 15) {                                        \
            ret = _mm_setzero_si128();                                  \
        } else {                                                        \
            ret = vreinterpretq_m128i_s8(vextq_s8(                      \
                vdupq_n_s8(0), vreinterpretq_s8_m128i(a), 16 - (imm))); \
        }                                                               \
        ret;                                                            \
    })
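
// Usage sketch (illustrative): whole-register byte shifts.
//
//   __m128i r1 = _mm_srli_si128(v, 4);  // drop the low 4 bytes, zero-fill high
//   __m128i r2 = _mm_slli_si128(v, 4);  // shift up by 4 bytes, zero-fill low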
// Shifts the 8 signed or unsigned 16-bit integers in a left by count bits while
// shifting in zeros.
//
//   r0 := a0 << count
//   r1 := a1 << count
//   ...
//   r7 := a7 << count
//
// https://msdn.microsoft.com/en-us/library/c79w388h(v%3dvs.90).aspx
FORCE_INLINE __m128i _mm_sll_epi16(__m128i a, __m128i count)
{
    uint64_t c = ((SIMDVec *) &count)->m128_u64[0];
    if (c > 15)
        return _mm_setzero_si128();

    int16x8_t vc = vdupq_n_s16((int16_t) c);
    return vreinterpretq_m128i_s16(vshlq_s16(vreinterpretq_s16_m128i(a), vc));
}
// NEON does not provide a version of this function.
// Creates a 16-bit mask from the most significant bits of the 16 signed or
// unsigned 8-bit integers in a and zero extends the upper bits.
// https://msdn.microsoft.com/en-us/library/vstudio/s090c8fk(v=vs.100).aspx
FORCE_INLINE int _mm_movemask_epi8(__m128i a)
{
    // Use increasingly wide shifts+adds to collect the sign bits together.
    //
    // Since the widening shifts would be rather confusing to follow in little
    // endian, everything will be illustrated in big endian order instead. This
    // has a different result - the bits would actually be reversed on a big
    // endian machine.

    // Starting input (only half the elements are shown):
    // 89 ff 1d c0 00 10 99 33
    uint8x16_t input = vreinterpretq_u8_m128i(a);

    // Shift out everything but the sign bits with an unsigned shift right.
    //
    // Bytes of the vector:
    // 89 ff 1d c0 00 10 99 33
    //  \  \  \  \  \  \  \  \    high_bits = (uint16x4_t)(input >> 7)
    // 01 01 00 01 00 00 01 00
    //
    // Bits of first important lane(s):
    // 10001001 (89)
    // 00000001 (01)
    uint16x8_t high_bits = vreinterpretq_u16_u8(vshrq_n_u8(input, 7));

    // Merge the even lanes together with a 16-bit unsigned shift right + add.
    // 'xx' represents garbage data which will be ignored in the final result.
    // In the important bytes, the add functions like a binary OR.
    //
    // 01 01 00 01 00 00 01 00
    //  \_ |  \_ |  \_ |  \_ |    paired16 = (uint32x4_t)(input + (input >> 7))
    // xx 03 xx 01 xx 00 xx 02
    //
    // 00000001 00000001 (01 01)
    // xxxxxxxx xxxxxx11 (xx 03)
    uint32x4_t paired16 =
        vreinterpretq_u32_u16(vsraq_n_u16(high_bits, high_bits, 7));

    // Repeat with a wider 32-bit shift + add.
    // xx 03 xx 01 xx 00 xx 02
    //     \____ |     \____ |    paired32 = (uint64x1_t)(paired16 + (paired16 >> 14))
    // xx xx xx 0d xx xx xx 02
    //
    // 00000011 00000001 (03 01)
    // xxxxxxxx xxxx1101 (xx 0d)
    uint64x2_t paired32 =
        vreinterpretq_u64_u32(vsraq_n_u32(paired16, paired16, 14));

    // Last, an even wider 64-bit shift + add to get our result in the low 8 bit
    // lanes.
    // xx xx xx 0d xx xx xx 02
    //            \_________ |    paired64 = (uint8x8_t)(paired32 + (paired32 >> 28))
    // xx xx xx xx xx xx xx d2
    //
    // 00001101 00000010 (0d 02)
    // xxxxxxxx 11010010 (xx d2)
    uint8x16_t paired64 =
        vreinterpretq_u8_u64(vsraq_n_u64(paired32, paired32, 28));

    // Extract the low 8 bits from each 64-bit lane with 2 8-bit extracts.
    // xx xx xx xx xx xx xx d2
    //                      ||    return paired64[0]
    //
    // Note: Little endian would return the correct value 4b (01001011) instead.
    return vgetq_lane_u8(paired64, 0) | ((int) vgetq_lane_u8(paired64, 8) << 8);
}
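
// Usage sketch (illustrative):
//
//   __m128i v = _mm_set1_epi8(-1);  // sign bit set in all 16 lanes
//   int m = _mm_movemask_epi8(v);   // m == 0xFFFF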
// NEON does not provide this method
// Creates a 4-bit mask from the most significant bits of the four
// single-precision, floating-point values.
// https://msdn.microsoft.com/en-us/library/vstudio/4490ys29(v=vs.100).aspx
FORCE_INLINE int _mm_movemask_ps(__m128 a)
{
    // Uses the exact same method as _mm_movemask_epi8, see that for details.
    uint32x4_t input = vreinterpretq_u32_m128(a);
    // Shift out everything but the sign bits with a 32-bit unsigned shift
    // right.
    uint64x2_t high_bits = vreinterpretq_u64_u32(vshrq_n_u32(input, 31));
    // Merge the two pairs together with a 64-bit unsigned shift right + add.
    uint8x16_t paired =
        vreinterpretq_u8_u64(vsraq_n_u64(high_bits, high_bits, 31));
    // Extract the result.
    return vgetq_lane_u8(paired, 0) | (vgetq_lane_u8(paired, 8) << 2);
}
// Compute the bitwise AND of 128 bits (representing integer data) in a and
// mask, and return 1 if the result is zero, otherwise return 0.
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_test_all_zeros&expand=5871
FORCE_INLINE int _mm_test_all_zeros(__m128i a, __m128i mask)
{
    int64x2_t a_and_mask =
        vandq_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(mask));
    return (vgetq_lane_s64(a_and_mask, 0) | vgetq_lane_s64(a_and_mask, 1)) ? 0
                                                                           : 1;
}
// ******************************************
// Arithmetic operations
// ******************************************
// Subtracts the four single-precision, floating-point values of a and b.
//
//   r0 := a0 - b0
//   r1 := a1 - b1
//   r2 := a2 - b2
//   r3 := a3 - b3
//
// https://msdn.microsoft.com/en-us/library/vstudio/1zad2k61(v=vs.100).aspx
FORCE_INLINE __m128 _mm_sub_ps(__m128 a, __m128 b)
{
    return vreinterpretq_m128_f32(
        vsubq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
}
// Subtract 2 packed 64-bit integers in b from 2 packed 64-bit integers in a,
// and store the results in dst.
FORCE_INLINE __m128i _mm_sub_epi64(__m128i a, __m128i b)
{
    return vreinterpretq_m128i_s64(
        vsubq_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(b)));
}

// Subtracts the 4 signed or unsigned 32-bit integers of b from the 4 signed or
// unsigned 32-bit integers of a.
// https://msdn.microsoft.com/en-us/library/vstudio/fhh866h0(v=vs.100).aspx
FORCE_INLINE __m128i _mm_sub_epi32(__m128i a, __m128i b)
{
    return vreinterpretq_m128i_s32(
        vsubq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
}

FORCE_INLINE __m128i _mm_sub_epi16(__m128i a, __m128i b)
{
    return vreinterpretq_m128i_s16(
        vsubq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
}

FORCE_INLINE __m128i _mm_sub_epi8(__m128i a, __m128i b)
{
    return vreinterpretq_m128i_s8(
        vsubq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
}
// Subtracts the 8 unsigned 16-bit integers of b from the 8 unsigned 16-bit
// integers of a and saturates.
// https://technet.microsoft.com/en-us/subscriptions/index/f44y0s19(v=vs.90).aspx
FORCE_INLINE __m128i _mm_subs_epu16(__m128i a, __m128i b)
{
    return vreinterpretq_m128i_u16(
        vqsubq_u16(vreinterpretq_u16_m128i(a), vreinterpretq_u16_m128i(b)));
}

// Subtracts the 16 unsigned 8-bit integers of b from the 16 unsigned 8-bit
// integers of a and saturates.
//
//   r0 := UnsignedSaturate(a0 - b0)
//   r1 := UnsignedSaturate(a1 - b1)
//   ...
//   r15 := UnsignedSaturate(a15 - b15)
//
// https://technet.microsoft.com/en-us/subscriptions/yadkxc18(v=vs.90)
FORCE_INLINE __m128i _mm_subs_epu8(__m128i a, __m128i b)
{
    return vreinterpretq_m128i_u8(
        vqsubq_u8(vreinterpretq_u8_m128i(a), vreinterpretq_u8_m128i(b)));
}

// Subtracts the 8 signed 16-bit integers of b from the 8 signed 16-bit integers
// of a and saturates.
//
//   r0 := SignedSaturate(a0 - b0)
//   r1 := SignedSaturate(a1 - b1)
//   ...
//   r7 := SignedSaturate(a7 - b7)
FORCE_INLINE __m128i _mm_subs_epi16(__m128i a, __m128i b)
{
    return vreinterpretq_m128i_s16(
        vqsubq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
}
FORCE_INLINE __m128i _mm_adds_epu16(__m128i a, __m128i b)
{
    return vreinterpretq_m128i_u16(
        vqaddq_u16(vreinterpretq_u16_m128i(a), vreinterpretq_u16_m128i(b)));
}
// Negate packed 8-bit integers in a when the corresponding signed
// 8-bit integer in b is negative, and store the results in dst.
// Elements in dst are zeroed out when the corresponding element
// in b is zero.
//
//   for i in 0..15
//       if b[i] < 0
//           r[i] := -a[i]
//       else if b[i] == 0
//           r[i] := 0
//       else
//           r[i] := a[i]
FORCE_INLINE __m128i _mm_sign_epi8(__m128i _a, __m128i _b)
{
    int8x16_t a = vreinterpretq_s8_m128i(_a);
    int8x16_t b = vreinterpretq_s8_m128i(_b);

    int8x16_t zero = vdupq_n_s8(0);
    // signed shift right: faster than vclt
    // (b < 0) ? 0xFF : 0
    uint8x16_t ltMask = vreinterpretq_u8_s8(vshrq_n_s8(b, 7));
    // (b == 0) ? 0xFF : 0
    int8x16_t zeroMask = vreinterpretq_s8_u8(vceqq_s8(b, zero));

    int8x16_t neg = vnegq_s8(a);
    // bitwise select -a where b is negative, a elsewhere (vbslq takes values
    // from its second argument where the mask bit is set)
    int8x16_t masked = vbslq_s8(ltMask, neg, a);
    // res = masked & (~zeroMask)
    int8x16_t res = vbicq_s8(masked, zeroMask);
    return vreinterpretq_m128i_s8(res);
}
// Negate packed 16-bit integers in a when the corresponding signed
// 16-bit integer in b is negative, and store the results in dst.
// Elements in dst are zeroed out when the corresponding element
// in b is zero.
//
//   for i in 0..7
//       if b[i] < 0
//           r[i] := -a[i]
//       else if b[i] == 0
//           r[i] := 0
//       else
//           r[i] := a[i]
FORCE_INLINE __m128i _mm_sign_epi16(__m128i _a, __m128i _b)
{
    int16x8_t a = vreinterpretq_s16_m128i(_a);
    int16x8_t b = vreinterpretq_s16_m128i(_b);

    int16x8_t zero = vdupq_n_s16(0);
    // signed shift right: faster than vclt
    // (b < 0) ? 0xFFFF : 0
    uint16x8_t ltMask = vreinterpretq_u16_s16(vshrq_n_s16(b, 15));
    // (b == 0) ? 0xFFFF : 0
    int16x8_t zeroMask = vreinterpretq_s16_u16(vceqq_s16(b, zero));

    int16x8_t neg = vnegq_s16(a);
    // bitwise select -a where b is negative, a elsewhere (vbslq takes values
    // from its second argument where the mask bit is set)
    int16x8_t masked = vbslq_s16(ltMask, neg, a);
    // res = masked & (~zeroMask)
    int16x8_t res = vbicq_s16(masked, zeroMask);
    return vreinterpretq_m128i_s16(res);
}
// Negate packed 32-bit integers in a when the corresponding signed
// 32-bit integer in b is negative, and store the results in dst.
// Elements in dst are zeroed out when the corresponding element
// in b is zero.
//
//   for i in 0..3
//       if b[i] < 0
//           r[i] := -a[i]
//       else if b[i] == 0
//           r[i] := 0
//       else
//           r[i] := a[i]
FORCE_INLINE __m128i _mm_sign_epi32(__m128i _a, __m128i _b)
{
    int32x4_t a = vreinterpretq_s32_m128i(_a);
    int32x4_t b = vreinterpretq_s32_m128i(_b);

    int32x4_t zero = vdupq_n_s32(0);
    // signed shift right: faster than vclt
    // (b < 0) ? 0xFFFFFFFF : 0
    uint32x4_t ltMask = vreinterpretq_u32_s32(vshrq_n_s32(b, 31));
    // (b == 0) ? 0xFFFFFFFF : 0
    int32x4_t zeroMask = vreinterpretq_s32_u32(vceqq_s32(b, zero));

    int32x4_t neg = vnegq_s32(a);
    // bitwise select -a where b is negative, a elsewhere (vbslq takes values
    // from its second argument where the mask bit is set)
    int32x4_t masked = vbslq_s32(ltMask, neg, a);
    // res = masked & (~zeroMask)
    int32x4_t res = vbicq_s32(masked, zeroMask);
    return vreinterpretq_m128i_s32(res);
}
// Computes the average of the 16 unsigned 8-bit integers in a and the 16
// unsigned 8-bit integers in b and rounds.
//
//   r0 := (a0 + b0) / 2
//   r1 := (a1 + b1) / 2
//   ...
//   r15 := (a15 + b15) / 2
//
// https://msdn.microsoft.com/en-us/library/vstudio/8zwh554a(v%3dvs.90).aspx
FORCE_INLINE __m128i _mm_avg_epu8(__m128i a, __m128i b)
{
    return vreinterpretq_m128i_u8(
        vrhaddq_u8(vreinterpretq_u8_m128i(a), vreinterpretq_u8_m128i(b)));
}

// Computes the average of the 8 unsigned 16-bit integers in a and the 8
// unsigned 16-bit integers in b and rounds.
//
//   r0 := (a0 + b0) / 2
//   r1 := (a1 + b1) / 2
//   ...
//   r7 := (a7 + b7) / 2
//
// https://msdn.microsoft.com/en-us/library/vstudio/y13ca3c8(v=vs.90).aspx
FORCE_INLINE __m128i _mm_avg_epu16(__m128i a, __m128i b)
{
    return vreinterpretq_m128i_u16(vrhaddq_u16(vreinterpretq_u16_m128i(a),
                                               vreinterpretq_u16_m128i(b)));
}

// Adds the four single-precision, floating-point values of a and b.
//
//   r0 := a0 + b0
//   r1 := a1 + b1
//   r2 := a2 + b2
//   r3 := a3 + b3
//
// https://msdn.microsoft.com/en-us/library/vstudio/c9848chc(v=vs.100).aspx
FORCE_INLINE __m128 _mm_add_ps(__m128 a, __m128 b)
{
    return vreinterpretq_m128_f32(
        vaddq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
}

// Adds the scalar single-precision floating point values of a and b.
// https://msdn.microsoft.com/en-us/library/be94x2y6(v=vs.100).aspx
FORCE_INLINE __m128 _mm_add_ss(__m128 a, __m128 b)
{
    float32_t b0 = vgetq_lane_f32(vreinterpretq_f32_m128(b), 0);
    float32x4_t value = vsetq_lane_f32(b0, vdupq_n_f32(0), 0);
    // the upper values in the result must be the remnants of <a>.
    return vreinterpretq_m128_f32(vaddq_f32(a, value));
}

// Adds the 2 signed or unsigned 64-bit integers in a to the 2 signed or
// unsigned 64-bit integers in b.
// https://msdn.microsoft.com/en-us/library/vstudio/09xs4fkk(v=vs.100).aspx
FORCE_INLINE __m128i _mm_add_epi64(__m128i a, __m128i b)
{
    return vreinterpretq_m128i_s64(
        vaddq_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(b)));
}

// Adds the 4 signed or unsigned 32-bit integers in a to the 4 signed or
// unsigned 32-bit integers in b.
//
//   r0 := a0 + b0
//   r1 := a1 + b1
//   r2 := a2 + b2
//   r3 := a3 + b3
//
// https://msdn.microsoft.com/en-us/library/vstudio/09xs4fkk(v=vs.100).aspx
FORCE_INLINE __m128i _mm_add_epi32(__m128i a, __m128i b)
{
    return vreinterpretq_m128i_s32(
        vaddq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
}

// Adds the 8 signed or unsigned 16-bit integers in a to the 8 signed or
// unsigned 16-bit integers in b.
// https://msdn.microsoft.com/en-us/library/fceha5k4(v=vs.100).aspx
FORCE_INLINE __m128i _mm_add_epi16(__m128i a, __m128i b)
{
    return vreinterpretq_m128i_s16(
        vaddq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
}

// Adds the 16 signed or unsigned 8-bit integers in a to the 16 signed or
// unsigned 8-bit integers in b.
// https://technet.microsoft.com/en-us/subscriptions/yc7tcyzs(v=vs.90)
FORCE_INLINE __m128i _mm_add_epi8(__m128i a, __m128i b)
{
    return vreinterpretq_m128i_s8(
        vaddq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
}

// Adds the 8 signed 16-bit integers in a to the 8 signed 16-bit integers in b
// and saturates.
//
//   r0 := SignedSaturate(a0 + b0)
//   r1 := SignedSaturate(a1 + b1)
//   ...
//   r7 := SignedSaturate(a7 + b7)
//
// https://msdn.microsoft.com/en-us/library/1a306ef8(v=vs.100).aspx
FORCE_INLINE __m128i _mm_adds_epi16(__m128i a, __m128i b)
{
    return vreinterpretq_m128i_s16(
        vqaddq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
}

// Adds the 16 unsigned 8-bit integers in a to the 16 unsigned 8-bit integers
// in b and saturates.
// https://msdn.microsoft.com/en-us/library/9hahyddy(v=vs.100).aspx
FORCE_INLINE __m128i _mm_adds_epu8(__m128i a, __m128i b)
{
    return vreinterpretq_m128i_u8(
        vqaddq_u8(vreinterpretq_u8_m128i(a), vreinterpretq_u8_m128i(b)));
}

// Multiplies the 8 signed or unsigned 16-bit integers from a by the 8 signed or
// unsigned 16-bit integers from b.
//
//   r0 := (a0 * b0)[15:0]
//   r1 := (a1 * b1)[15:0]
//   ...
//   r7 := (a7 * b7)[15:0]
//
// https://msdn.microsoft.com/en-us/library/vstudio/9ks1472s(v=vs.100).aspx
FORCE_INLINE __m128i _mm_mullo_epi16(__m128i a, __m128i b)
{
    return vreinterpretq_m128i_s16(
        vmulq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
}

// Multiplies the 4 signed or unsigned 32-bit integers from a by the 4 signed or
// unsigned 32-bit integers from b.
// https://msdn.microsoft.com/en-us/library/vstudio/bb531409(v=vs.100).aspx
FORCE_INLINE __m128i _mm_mullo_epi32(__m128i a, __m128i b)
{
    return vreinterpretq_m128i_s32(
        vmulq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
}

// Multiplies the four single-precision, floating-point values of a and b.
//
//   r0 := a0 * b0
//   r1 := a1 * b1
//   r2 := a2 * b2
//   r3 := a3 * b3
//
// https://msdn.microsoft.com/en-us/library/vstudio/22kbk6t9(v=vs.100).aspx
FORCE_INLINE __m128 _mm_mul_ps(__m128 a, __m128 b)
{
    return vreinterpretq_m128_f32(
        vmulq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
}

// Multiply the low unsigned 32-bit integers from each packed 64-bit element in
// a and b, and store the unsigned 64-bit results in dst.
//
//   r0 := (a0 & 0xFFFFFFFF) * (b0 & 0xFFFFFFFF)
//   r1 := (a2 & 0xFFFFFFFF) * (b2 & 0xFFFFFFFF)
FORCE_INLINE __m128i _mm_mul_epu32(__m128i a, __m128i b)
{
    // vmull_u32 upcasts instead of masking, so we downcast.
    uint32x2_t a_lo = vmovn_u64(vreinterpretq_u64_m128i(a));
    uint32x2_t b_lo = vmovn_u64(vreinterpretq_u64_m128i(b));
    return vreinterpretq_m128i_u64(vmull_u32(a_lo, b_lo));
}
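
// vmovn_u64 keeps the low 32 bits of each 64-bit lane, which are exactly the
// even-indexed 32-bit elements (a0/a2 and b0/b2) this intrinsic multiplies.
// A small worked example (lane values illustrative, listed low to high):
//
//   a = [3, 5, 7, 9], b = [2, 4, 6, 8]   (32-bit lanes)
//   r = [3 * 2, 7 * 6] = [6, 42]         (64-bit lanes)
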
// Multiply the low signed 32-bit integers from each packed 64-bit element in
// a and b, and store the signed 64-bit results in dst.
//
//   r0 := (int64_t)(int32_t)a0 * (int64_t)(int32_t)b0
//   r1 := (int64_t)(int32_t)a2 * (int64_t)(int32_t)b2
FORCE_INLINE __m128i _mm_mul_epi32(__m128i a, __m128i b)
{
    // vmull_s32 upcasts instead of masking, so we downcast.
    int32x2_t a_lo = vmovn_s64(vreinterpretq_s64_m128i(a));
    int32x2_t b_lo = vmovn_s64(vreinterpretq_s64_m128i(b));
    return vreinterpretq_m128i_s64(vmull_s32(a_lo, b_lo));
}

// Multiplies the 8 signed 16-bit integers from a by the 8 signed 16-bit
// integers from b.
//
//   r0 := (a0 * b0) + (a1 * b1)
//   r1 := (a2 * b2) + (a3 * b3)
//   r2 := (a4 * b4) + (a5 * b5)
//   r3 := (a6 * b6) + (a7 * b7)
//
// https://msdn.microsoft.com/en-us/library/yht36sa6(v=vs.90).aspx
FORCE_INLINE __m128i _mm_madd_epi16(__m128i a, __m128i b)
{
    int32x4_t low = vmull_s16(vget_low_s16(vreinterpretq_s16_m128i(a)),
                              vget_low_s16(vreinterpretq_s16_m128i(b)));
    int32x4_t high = vmull_s16(vget_high_s16(vreinterpretq_s16_m128i(a)),
                               vget_high_s16(vreinterpretq_s16_m128i(b)));

    int32x2_t low_sum = vpadd_s32(vget_low_s32(low), vget_high_s32(low));
    int32x2_t high_sum = vpadd_s32(vget_low_s32(high), vget_high_s32(high));

    return vreinterpretq_m128i_s32(vcombine_s32(low_sum, high_sum));
}
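
// The widening multiplies above produce, e.g.,
// low = [a0*b0, a1*b1, a2*b2, a3*b3]; vpadd_s32 then sums adjacent pairs,
// giving [a0*b0 + a1*b1, a2*b2 + a3*b3], which matches the r0/r1 lanes.
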
// Multiplies packed signed 16-bit integers in a and b, producing intermediate
// signed 32-bit integers. Each intermediate integer is shifted right by 15
// bits with rounding, and the resulting packed 16-bit integers are stored in
// dst.
//
//   r0 := Round(((int32_t)a0 * (int32_t)b0) >> 15)
//   r1 := Round(((int32_t)a1 * (int32_t)b1) >> 15)
//   r2 := Round(((int32_t)a2 * (int32_t)b2) >> 15)
//   ...
//   r7 := Round(((int32_t)a7 * (int32_t)b7) >> 15)
FORCE_INLINE __m128i _mm_mulhrs_epi16(__m128i a, __m128i b)
{
    // vqrdmulhq_s16 would be a one-instruction candidate, but it has issues
    // due to saturation:
    // return vreinterpretq_m128i_s16(vqrdmulhq_s16(a, b));

    // Multiply
    int32x4_t mul_lo = vmull_s16(vget_low_s16(vreinterpretq_s16_m128i(a)),
                                 vget_low_s16(vreinterpretq_s16_m128i(b)));
    int32x4_t mul_hi = vmull_s16(vget_high_s16(vreinterpretq_s16_m128i(a)),
                                 vget_high_s16(vreinterpretq_s16_m128i(b)));

    // Rounding narrowing shift right
    // narrow = (int16_t)((mul + 16384) >> 15);
    int16x4_t narrow_lo = vrshrn_n_s32(mul_lo, 15);
    int16x4_t narrow_hi = vrshrn_n_s32(mul_hi, 15);

    // Join together
    return vreinterpretq_m128i_s16(vcombine_s16(narrow_lo, narrow_hi));
}
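
// Scalar model of one lane (a sketch of what the vector code computes):
//
//   int32_t mul = (int32_t) a0 * (int32_t) b0;
//   int16_t r0 = (int16_t) ((mul + (1 << 14)) >> 15);
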
// Vertically multiply each unsigned 8-bit integer from a with the corresponding
// signed 8-bit integer from b, producing intermediate signed 16-bit integers.
// Horizontally add adjacent pairs of intermediate signed 16-bit integers,
// and pack the saturated results in dst.
//
//   for i in 0..7
//     dst[i+15:i] := Saturate_To_Int16( a[i+15:i+8]*b[i+15:i+8] + a[i+7:i]*b[i+7:i] )
//   done
FORCE_INLINE __m128i _mm_maddubs_epi16(__m128i _a, __m128i _b)
{
    // This would be much simpler if x86 would choose to zero extend OR sign
    // extend, not both. This could probably be optimized better.
    uint16x8_t a = vreinterpretq_u16_m128i(_a);
    int16x8_t b = vreinterpretq_s16_m128i(_b);

    // Zero extend a
    int16x8_t a_odd = vreinterpretq_s16_u16(vshrq_n_u16(a, 8));
    int16x8_t a_even = vreinterpretq_s16_u16(vbicq_u16(a, vdupq_n_u16(0xff00)));

    // Sign extend by shifting left then shifting right.
    int16x8_t b_even = vshrq_n_s16(vshlq_n_s16(b, 8), 8);
    int16x8_t b_odd = vshrq_n_s16(b, 8);

    // multiply
    int16x8_t prod1 = vmulq_s16(a_even, b_even);
    int16x8_t prod2 = vmulq_s16(a_odd, b_odd);

    // saturated add
    return vreinterpretq_m128i_s16(vqaddq_s16(prod1, prod2));
}
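
// Worked example for one output lane (byte values illustrative):
//
//   a0 = 200 (unsigned), a1 = 100;  b0 = -1 (signed), b1 = 2
//   r0 = SignedSaturate16(200 * -1 + 100 * 2) = 0
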
// Computes the absolute difference of the 16 unsigned 8-bit integers from a
// and the 16 unsigned 8-bit integers from b.
//
// Sums the upper 8 differences and lower 8 differences and packs the
// resulting 2 unsigned 16-bit integers into the upper and lower 64-bit
// elements.
//
//   r0 := abs(a0 - b0) + abs(a1 - b1) +...+ abs(a7 - b7)
//   r1 := 0x0
//   r2 := 0x0
//   r3 := 0x0
//   r4 := abs(a8 - b8) + abs(a9 - b9) +...+ abs(a15 - b15)
//   r5 := 0x0
//   r6 := 0x0
//   r7 := 0x0
FORCE_INLINE __m128i _mm_sad_epu8(__m128i a, __m128i b)
{
    uint16x8_t t = vpaddlq_u8(vabdq_u8((uint8x16_t) a, (uint8x16_t) b));
    uint16_t r0 = t[0] + t[1] + t[2] + t[3];
    uint16_t r4 = t[4] + t[5] + t[6] + t[7];
    uint16x8_t r = vsetq_lane_u16(r0, vdupq_n_u16(0), 0);
    return (__m128i) vsetq_lane_u16(r4, r, 4);
}
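
// A typical use is summing the two partial SADs into one value; a minimal
// sketch using other intrinsics from this file (variable names illustrative):
//
//   __m128i sad = _mm_sad_epu8(ref, cur);
//   int total = _mm_cvtsi128_si32(sad) + _mm_extract_epi16(sad, 4);
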
// Divides the four single-precision, floating-point values of a by b.
//
//   r0 := a0 / b0
//   r1 := a1 / b1
//   r2 := a2 / b2
//   r3 := a3 / b3
//
// https://msdn.microsoft.com/en-us/library/edaw8147(v=vs.100).aspx
FORCE_INLINE __m128 _mm_div_ps(__m128 a, __m128 b)
{
    float32x4_t recip0 = vrecpeq_f32(vreinterpretq_f32_m128(b));
    float32x4_t recip1 =
        vmulq_f32(recip0, vrecpsq_f32(recip0, vreinterpretq_f32_m128(b)));
    return vreinterpretq_m128_f32(vmulq_f32(vreinterpretq_f32_m128(a), recip1));
}
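
// vrecpeq_f32 alone gives only a rough reciprocal estimate; the vrecpsq_f32
// Newton-Raphson step above refines it. Where more accuracy is needed, a
// further step can be added before the final multiply (a sketch reusing the
// locals above):
//
//   recip1 = vmulq_f32(recip1, vrecpsq_f32(recip1, vreinterpretq_f32_m128(b)));
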
// Divides the scalar single-precision floating point value of a by b.
// https://msdn.microsoft.com/en-us/library/4y73xa49(v=vs.100).aspx
FORCE_INLINE __m128 _mm_div_ss(__m128 a, __m128 b)
{
    float32_t value =
        vgetq_lane_f32(vreinterpretq_f32_m128(_mm_div_ps(a, b)), 0);
    return vreinterpretq_m128_f32(
        vsetq_lane_f32(value, vreinterpretq_f32_m128(a), 0));
}

// Computes the approximations of reciprocals of the four single-precision,
// floating-point values of a. This version performs additional Newton-Raphson
// iterations to improve accuracy; between 1 and 4 iterations are recommended.
// https://msdn.microsoft.com/en-us/library/vstudio/796k1tty(v=vs.100).aspx
FORCE_INLINE __m128 recipq_newton(__m128 in, int n)
{
    int i;
    float32x4_t recip = vrecpeq_f32(vreinterpretq_f32_m128(in));
    for (i = 0; i < n; ++i) {
        recip =
            vmulq_f32(recip, vrecpsq_f32(recip, vreinterpretq_f32_m128(in)));
    }
    return vreinterpretq_m128_f32(recip);
}

// Computes the approximations of reciprocals of the four single-precision,
// floating-point values of a.
// https://msdn.microsoft.com/en-us/library/vstudio/796k1tty(v=vs.100).aspx
FORCE_INLINE __m128 _mm_rcp_ps(__m128 in)
{
    float32x4_t recip = vrecpeq_f32(vreinterpretq_f32_m128(in));
    recip = vmulq_f32(recip, vrecpsq_f32(recip, vreinterpretq_f32_m128(in)));
    return vreinterpretq_m128_f32(recip);
}

// Computes the approximations of square roots of the four single-precision,
// floating-point values of a. First computes reciprocal square roots and then
// reciprocals of the four values.
//
//   r0 := sqrt(a0)
//   r1 := sqrt(a1)
//   r2 := sqrt(a2)
//   r3 := sqrt(a3)
//
// https://msdn.microsoft.com/en-us/library/vstudio/8z67bwwk(v=vs.100).aspx
FORCE_INLINE __m128 _mm_sqrt_ps(__m128 in)
{
    float32x4_t recipsq = vrsqrteq_f32(vreinterpretq_f32_m128(in));
    float32x4_t sq = vrecpeq_f32(recipsq);
    // TODO: should the step versions (vrsqrtsq_f32, vrecpsq_f32) of both
    // estimates be used for better accuracy?
    return vreinterpretq_m128_f32(sq);
}

// Computes the approximation of the square root of the scalar single-precision
// floating point value of in.
// https://msdn.microsoft.com/en-us/library/ahfsc22d(v=vs.100).aspx
FORCE_INLINE __m128 _mm_sqrt_ss(__m128 in)
{
    float32_t value =
        vgetq_lane_f32(vreinterpretq_f32_m128(_mm_sqrt_ps(in)), 0);
    return vreinterpretq_m128_f32(
        vsetq_lane_f32(value, vreinterpretq_f32_m128(in), 0));
}

// Computes the approximations of the reciprocal square roots of the four
// single-precision floating point values of in.
// https://msdn.microsoft.com/en-us/library/22hfsh53(v=vs.100).aspx
FORCE_INLINE __m128 _mm_rsqrt_ps(__m128 in)
{
    return vreinterpretq_m128_f32(vrsqrteq_f32(vreinterpretq_f32_m128(in)));
}

// Computes the maxima of the four single-precision, floating-point values of
// a and b.
// https://msdn.microsoft.com/en-us/library/vstudio/ff5d607a(v=vs.100).aspx
FORCE_INLINE __m128 _mm_max_ps(__m128 a, __m128 b)
{
    return vreinterpretq_m128_f32(
        vmaxq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
}

// Computes the minima of the four single-precision, floating-point values of a
// and b.
// https://msdn.microsoft.com/en-us/library/vstudio/wh13kadz(v=vs.100).aspx
FORCE_INLINE __m128 _mm_min_ps(__m128 a, __m128 b)
{
    return vreinterpretq_m128_f32(
        vminq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
}

// Computes the maximum of the two lower scalar single-precision floating point
// values of a and b.
// https://msdn.microsoft.com/en-us/library/s6db5esz(v=vs.100).aspx
FORCE_INLINE __m128 _mm_max_ss(__m128 a, __m128 b)
{
    float32_t value = vgetq_lane_f32(
        vmaxq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)), 0);
    return vreinterpretq_m128_f32(
        vsetq_lane_f32(value, vreinterpretq_f32_m128(a), 0));
}

// Computes the minimum of the two lower scalar single-precision floating point
// values of a and b.
// https://msdn.microsoft.com/en-us/library/0a9y7xaa(v=vs.100).aspx
FORCE_INLINE __m128 _mm_min_ss(__m128 a, __m128 b)
{
    float32_t value = vgetq_lane_f32(
        vminq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)), 0);
    return vreinterpretq_m128_f32(
        vsetq_lane_f32(value, vreinterpretq_f32_m128(a), 0));
}

// Computes the pairwise maxima of the 16 unsigned 8-bit integers from a and the
// 16 unsigned 8-bit integers from b.
// https://msdn.microsoft.com/en-us/library/st6634za(v=vs.100).aspx
FORCE_INLINE __m128i _mm_max_epu8(__m128i a, __m128i b)
{
    return vreinterpretq_m128i_u8(
        vmaxq_u8(vreinterpretq_u8_m128i(a), vreinterpretq_u8_m128i(b)));
}

// Computes the pairwise minima of the 16 unsigned 8-bit integers from a and the
// 16 unsigned 8-bit integers from b.
// https://msdn.microsoft.com/ko-kr/library/17k8cf58(v=vs.100).aspxx
FORCE_INLINE __m128i _mm_min_epu8(__m128i a, __m128i b)
{
    return vreinterpretq_m128i_u8(
        vminq_u8(vreinterpretq_u8_m128i(a), vreinterpretq_u8_m128i(b)));
}

// Computes the pairwise minima of the 8 signed 16-bit integers from a and the 8
// signed 16-bit integers from b.
// https://msdn.microsoft.com/en-us/library/vstudio/6te997ew(v=vs.100).aspx
FORCE_INLINE __m128i _mm_min_epi16(__m128i a, __m128i b)
{
    return vreinterpretq_m128i_s16(
        vminq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
}

// Computes the pairwise maxima of the 8 signed 16-bit integers from a and the 8
// signed 16-bit integers from b.
// https://msdn.microsoft.com/en-us/LIBRary/3x060h7c(v=vs.100).aspx
FORCE_INLINE __m128i _mm_max_epi16(__m128i a, __m128i b)
{
    return vreinterpretq_m128i_s16(
        vmaxq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
}

// epi versions of min/max
// Computes the pairwise maxima of the four signed 32-bit integer values of a
// and b.
//
// A 128-bit parameter that can be defined with the following equations:
//   r0 := (a0 > b0) ? a0 : b0
//   r1 := (a1 > b1) ? a1 : b1
//   r2 := (a2 > b2) ? a2 : b2
//   r3 := (a3 > b3) ? a3 : b3
//
// https://msdn.microsoft.com/en-us/library/vstudio/bb514055(v=vs.100).aspx
FORCE_INLINE __m128i _mm_max_epi32(__m128i a, __m128i b)
{
    return vreinterpretq_m128i_s32(
        vmaxq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
}

// Computes the pairwise minima of the four signed 32-bit integer values of a
// and b.
//
// A 128-bit parameter that can be defined with the following equations:
//   r0 := (a0 < b0) ? a0 : b0
//   r1 := (a1 < b1) ? a1 : b1
//   r2 := (a2 < b2) ? a2 : b2
//   r3 := (a3 < b3) ? a3 : b3
//
// https://msdn.microsoft.com/en-us/library/vstudio/bb531476(v=vs.100).aspx
FORCE_INLINE __m128i _mm_min_epi32(__m128i a, __m128i b)
{
    return vreinterpretq_m128i_s32(
        vminq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
}

// Multiplies the 8 signed 16-bit integers from a by the 8 signed 16-bit
// integers from b and keeps the upper 16 bits of each product.
//
//   r0 := (a0 * b0)[31:16]
//   r1 := (a1 * b1)[31:16]
//   ...
//   r7 := (a7 * b7)[31:16]
//
// https://msdn.microsoft.com/en-us/library/vstudio/59hddw1d(v=vs.100).aspx
FORCE_INLINE __m128i _mm_mulhi_epi16(__m128i a, __m128i b)
{
    /* FIXME: issue with large values because of result saturation */
    // int16x8_t ret = vqdmulhq_s16(vreinterpretq_s16_m128i(a),
    //                              vreinterpretq_s16_m128i(b)); /* =2*a*b */
    // return vreinterpretq_m128i_s16(vshrq_n_s16(ret, 1));
    int16x4_t a3210 = vget_low_s16(vreinterpretq_s16_m128i(a));
    int16x4_t b3210 = vget_low_s16(vreinterpretq_s16_m128i(b));
    int32x4_t ab3210 = vmull_s16(a3210, b3210); /* 3333222211110000 */
    int16x4_t a7654 = vget_high_s16(vreinterpretq_s16_m128i(a));
    int16x4_t b7654 = vget_high_s16(vreinterpretq_s16_m128i(b));
    int32x4_t ab7654 = vmull_s16(a7654, b7654); /* 7777666655554444 */
    uint16x8x2_t r =
        vuzpq_u16(vreinterpretq_u16_s32(ab3210), vreinterpretq_u16_s32(ab7654));
    return vreinterpretq_m128i_u16(r.val[1]);
}

// Computes pairwise add of each argument as single-precision, floating-point
// values a and b.
// https://msdn.microsoft.com/en-us/library/yd9wecaa.aspx
FORCE_INLINE __m128 _mm_hadd_ps(__m128 a, __m128 b)
{
#if defined(__aarch64__)
    return vreinterpretq_m128_f32(vpaddq_f32(
        vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));  // AArch64
#else
    float32x2_t a10 = vget_low_f32(vreinterpretq_f32_m128(a));
    float32x2_t a32 = vget_high_f32(vreinterpretq_f32_m128(a));
    float32x2_t b10 = vget_low_f32(vreinterpretq_f32_m128(b));
    float32x2_t b32 = vget_high_f32(vreinterpretq_f32_m128(b));
    return vreinterpretq_m128_f32(
        vcombine_f32(vpadd_f32(a10, a32), vpadd_f32(b10, b32)));
#endif
}

// Computes pairwise add of each argument as a 16-bit signed or unsigned integer
// values a and b.
FORCE_INLINE __m128i _mm_hadd_epi16(__m128i _a, __m128i _b)
{
    int16x8_t a = vreinterpretq_s16_m128i(_a);
    int16x8_t b = vreinterpretq_s16_m128i(_b);
#if defined(__aarch64__)
    return vreinterpretq_m128i_s16(vpaddq_s16(a, b));
#else
    return vreinterpretq_m128i_s16(
        vcombine_s16(vpadd_s16(vget_low_s16(a), vget_high_s16(a)),
                     vpadd_s16(vget_low_s16(b), vget_high_s16(b))));
#endif
}

// Computes pairwise difference of each argument as a 16-bit signed or unsigned
// integer values a and b.
FORCE_INLINE __m128i _mm_hsub_epi16(__m128i _a, __m128i _b)
{
    int32x4_t a = vreinterpretq_s32_m128i(_a);
    int32x4_t b = vreinterpretq_s32_m128i(_b);
    // Interleave using vshrn/vmovn
    // [a0|a2|a4|a6|b0|b2|b4|b6]
    // [a1|a3|a5|a7|b1|b3|b5|b7]
    int16x8_t ab0246 = vcombine_s16(vmovn_s32(a), vmovn_s32(b));
    int16x8_t ab1357 = vcombine_s16(vshrn_n_s32(a, 16), vshrn_n_s32(b, 16));
    // Subtract
    return vreinterpretq_m128i_s16(vsubq_s16(ab0246, ab1357));
}
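
// vmovn_s32 keeps the even-indexed 16-bit halves and vshrn_n_s32(x, 16) the
// odd-indexed ones, so the final subtract computes, lane by lane:
//
//   r = [a0-a1, a2-a3, a4-a5, a6-a7, b0-b1, b2-b3, b4-b5, b6-b7]
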
// Computes saturated pairwise add of each argument as a 16-bit signed
// integer values a and b.
FORCE_INLINE __m128i _mm_hadds_epi16(__m128i _a, __m128i _b)
{
    int32x4_t a = vreinterpretq_s32_m128i(_a);
    int32x4_t b = vreinterpretq_s32_m128i(_b);
    // Interleave using vshrn/vmovn
    // [a0|a2|a4|a6|b0|b2|b4|b6]
    // [a1|a3|a5|a7|b1|b3|b5|b7]
    int16x8_t ab0246 = vcombine_s16(vmovn_s32(a), vmovn_s32(b));
    int16x8_t ab1357 = vcombine_s16(vshrn_n_s32(a, 16), vshrn_n_s32(b, 16));
    // Saturated add
    return vreinterpretq_m128i_s16(vqaddq_s16(ab0246, ab1357));
}

// Computes saturated pairwise difference of each argument as a 16-bit signed
// integer values a and b.
FORCE_INLINE __m128i _mm_hsubs_epi16(__m128i _a, __m128i _b)
{
    int32x4_t a = vreinterpretq_s32_m128i(_a);
    int32x4_t b = vreinterpretq_s32_m128i(_b);
    // Interleave using vshrn/vmovn
    // [a0|a2|a4|a6|b0|b2|b4|b6]
    // [a1|a3|a5|a7|b1|b3|b5|b7]
    int16x8_t ab0246 = vcombine_s16(vmovn_s32(a), vmovn_s32(b));
    int16x8_t ab1357 = vcombine_s16(vshrn_n_s32(a, 16), vshrn_n_s32(b, 16));
    // Saturated subtract
    return vreinterpretq_m128i_s16(vqsubq_s16(ab0246, ab1357));
}

// Computes pairwise add of each argument as a 32-bit signed or unsigned integer
// values a and b.
FORCE_INLINE __m128i _mm_hadd_epi32(__m128i _a, __m128i _b)
{
    int32x4_t a = vreinterpretq_s32_m128i(_a);
    int32x4_t b = vreinterpretq_s32_m128i(_b);
    return vreinterpretq_m128i_s32(
        vcombine_s32(vpadd_s32(vget_low_s32(a), vget_high_s32(a)),
                     vpadd_s32(vget_low_s32(b), vget_high_s32(b))));
}

// Computes pairwise difference of each argument as a 32-bit signed or unsigned
// integer values a and b.
FORCE_INLINE __m128i _mm_hsub_epi32(__m128i _a, __m128i _b)
{
    int64x2_t a = vreinterpretq_s64_m128i(_a);
    int64x2_t b = vreinterpretq_s64_m128i(_b);
    // Interleave using vshrn/vmovn
    // [a0|a2|b0|b2]
    // [a1|a3|b1|b3]
    int32x4_t ab02 = vcombine_s32(vmovn_s64(a), vmovn_s64(b));
    int32x4_t ab13 = vcombine_s32(vshrn_n_s64(a, 32), vshrn_n_s64(b, 32));
    // Subtract
    return vreinterpretq_m128i_s32(vsubq_s32(ab02, ab13));
}

// ******************************************
// Compare operations
// ******************************************

// Compares for less than.
// https://msdn.microsoft.com/en-us/library/vstudio/f330yhc8(v=vs.100).aspx
FORCE_INLINE __m128 _mm_cmplt_ps(__m128 a, __m128 b)
{
    return vreinterpretq_m128_u32(
        vcltq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
}

// Compares for greater than.
//
//   r0 := (a0 > b0) ? 0xffffffff : 0x0
//   r1 := (a1 > b1) ? 0xffffffff : 0x0
//   r2 := (a2 > b2) ? 0xffffffff : 0x0
//   r3 := (a3 > b3) ? 0xffffffff : 0x0
//
// https://msdn.microsoft.com/en-us/library/vstudio/11dy102s(v=vs.100).aspx
FORCE_INLINE __m128 _mm_cmpgt_ps(__m128 a, __m128 b)
{
    return vreinterpretq_m128_u32(
        vcgtq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
}

// Compares for greater than or equal.
// https://msdn.microsoft.com/en-us/library/vstudio/fs813y2t(v=vs.100).aspx
FORCE_INLINE __m128 _mm_cmpge_ps(__m128 a, __m128 b)
{
    return vreinterpretq_m128_u32(
        vcgeq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
}

// Compares for less than or equal.
//
//   r0 := (a0 <= b0) ? 0xffffffff : 0x0
//   r1 := (a1 <= b1) ? 0xffffffff : 0x0
//   r2 := (a2 <= b2) ? 0xffffffff : 0x0
//   r3 := (a3 <= b3) ? 0xffffffff : 0x0
//
// https://msdn.microsoft.com/en-us/library/vstudio/1s75w83z(v=vs.100).aspx
FORCE_INLINE __m128 _mm_cmple_ps(__m128 a, __m128 b)
{
    return vreinterpretq_m128_u32(
        vcleq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
}

// Compares for equality.
// https://msdn.microsoft.com/en-us/library/vstudio/36aectz5(v=vs.100).aspx
FORCE_INLINE __m128 _mm_cmpeq_ps(__m128 a, __m128 b)
{
    return vreinterpretq_m128_u32(
        vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
}

// Compares the 16 signed or unsigned 8-bit integers in a and the 16 signed or
// unsigned 8-bit integers in b for equality.
// https://msdn.microsoft.com/en-us/library/windows/desktop/bz5xk21a(v=vs.90).aspx
FORCE_INLINE __m128i _mm_cmpeq_epi8(__m128i a, __m128i b)
{
    return vreinterpretq_m128i_u8(
        vceqq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
}

// Compares the 8 signed or unsigned 16-bit integers in a and the 8 signed or
// unsigned 16-bit integers in b for equality.
// https://msdn.microsoft.com/en-us/library/2ay060te(v=vs.100).aspx
FORCE_INLINE __m128i _mm_cmpeq_epi16(__m128i a, __m128i b)
{
    return vreinterpretq_m128i_u16(
        vceqq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
}

// Compare packed 32-bit integers in a and b for equality, and store the results
// in dst.
FORCE_INLINE __m128i _mm_cmpeq_epi32(__m128i a, __m128i b)
{
    return vreinterpretq_m128i_u32(
        vceqq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
}

// Compare packed 64-bit integers in a and b for equality, and store the results
// in dst.
FORCE_INLINE __m128i _mm_cmpeq_epi64(__m128i a, __m128i b)
{
#if defined(__aarch64__)
    return vreinterpretq_m128i_u64(
        vceqq_u64(vreinterpretq_u64_m128i(a), vreinterpretq_u64_m128i(b)));
#else
    // ARMv7 lacks vceqq_u64
    // (a == b) -> (a_lo == b_lo) && (a_hi == b_hi)
    uint32x4_t cmp =
        vceqq_u32(vreinterpretq_u32_m128i(a), vreinterpretq_u32_m128i(b));
    uint32x4_t swapped = vrev64q_u32(cmp);
    return vreinterpretq_m128i_u32(vandq_u32(cmp, swapped));
#endif
}
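
// The trick: vrev64q_u32 swaps the two 32-bit halves of each 64-bit lane, so
// ANDing the compare mask with its swapped self is all-ones only when both
// halves matched. For example, if only the low halves are equal:
//
//   cmp           = [0xFFFFFFFF, 0x00000000, ...]
//   swapped       = [0x00000000, 0xFFFFFFFF, ...]
//   cmp & swapped = [0x00000000, 0x00000000, ...]   -> lane reports not-equal
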
// Compares the 16 signed 8-bit integers in a and the 16 signed 8-bit integers
// in b for less than.
// https://msdn.microsoft.com/en-us/library/windows/desktop/9s46csht(v=vs.90).aspx
FORCE_INLINE __m128i _mm_cmplt_epi8(__m128i a, __m128i b)
{
    return vreinterpretq_m128i_u8(
        vcltq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
}

// Compares the 16 signed 8-bit integers in a and the 16 signed 8-bit integers
// in b for greater than.
//
//   r0 := (a0 > b0) ? 0xff : 0x0
//   r1 := (a1 > b1) ? 0xff : 0x0
//   ...
//   r15 := (a15 > b15) ? 0xff : 0x0
//
// https://msdn.microsoft.com/zh-tw/library/wf45zt2b(v=vs.100).aspx
FORCE_INLINE __m128i _mm_cmpgt_epi8(__m128i a, __m128i b)
{
    return vreinterpretq_m128i_u8(
        vcgtq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
}

// Compares the 8 signed 16-bit integers in a and the 8 signed 16-bit integers
// in b for less than.
//
//   r0 := (a0 < b0) ? 0xffff : 0x0
//   r1 := (a1 < b1) ? 0xffff : 0x0
//   ...
//   r7 := (a7 < b7) ? 0xffff : 0x0
//
// https://technet.microsoft.com/en-us/library/t863edb2(v=vs.100).aspx
FORCE_INLINE __m128i _mm_cmplt_epi16(__m128i a, __m128i b)
{
    return vreinterpretq_m128i_u16(
        vcltq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
}

// Compares the 8 signed 16-bit integers in a and the 8 signed 16-bit integers
// in b for greater than.
//
//   r0 := (a0 > b0) ? 0xffff : 0x0
//   r1 := (a1 > b1) ? 0xffff : 0x0
//   ...
//   r7 := (a7 > b7) ? 0xffff : 0x0
//
// https://technet.microsoft.com/en-us/library/xd43yfsa(v=vs.100).aspx
FORCE_INLINE __m128i _mm_cmpgt_epi16(__m128i a, __m128i b)
{
    return vreinterpretq_m128i_u16(
        vcgtq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
}

// Compares the 4 signed 32-bit integers in a and the 4 signed 32-bit integers
// in b for less than.
// https://msdn.microsoft.com/en-us/library/vstudio/4ak0bf5d(v=vs.100).aspx
FORCE_INLINE __m128i _mm_cmplt_epi32(__m128i a, __m128i b)
{
    return vreinterpretq_m128i_u32(
        vcltq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
}

// Compares the 4 signed 32-bit integers in a and the 4 signed 32-bit integers
// in b for greater than.
// https://msdn.microsoft.com/en-us/library/vstudio/1s9f2z0y(v=vs.100).aspx
FORCE_INLINE __m128i _mm_cmpgt_epi32(__m128i a, __m128i b)
{
    return vreinterpretq_m128i_u32(
        vcgtq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
}

// Compares the 2 signed 64-bit integers in a and the 2 signed 64-bit integers
// in b for greater than.
FORCE_INLINE __m128i _mm_cmpgt_epi64(__m128i a, __m128i b)
{
#if defined(__aarch64__)
    return vreinterpretq_m128i_u64(
        vcgtq_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(b)));
#else
    // ARMv7 lacks vcgtq_s64.
    // This is based off of Clang's SSE2 polyfill:
    // (a > b) -> ((a_hi > b_hi) || (a_lo > b_lo && a_hi == b_hi))

    // Mask the sign bit out since we need a signed AND an unsigned comparison
    // and it is ugly to try and split them.
    int32x4_t mask = vreinterpretq_s32_s64(vdupq_n_s64(0x80000000ull));
    int32x4_t a_mask = veorq_s32(vreinterpretq_s32_m128i(a), mask);
    int32x4_t b_mask = veorq_s32(vreinterpretq_s32_m128i(b), mask);
    // Compare for greater than
    int64x2_t greater = vreinterpretq_s64_u32(vcgtq_s32(a_mask, b_mask));
    // Copy upper mask to lower mask
    int64x2_t gt_hi = vshrq_n_s64(greater, 63);
    // Copy lower mask to upper mask
    int64x2_t gt_lo = vsliq_n_s64(greater, greater, 32);
    // Compare for equality
    int64x2_t equal = vreinterpretq_s64_u32(vceqq_s32(a_mask, b_mask));
    // Copy upper mask to lower mask
    int64x2_t eq_hi = vshrq_n_s64(equal, 63);
    // a_hi > b_hi || (a_lo > b_lo && a_hi == b_hi)
    int64x2_t ret = vorrq_s64(gt_hi, vandq_s64(gt_lo, eq_hi));
    return vreinterpretq_m128i_s64(ret);
#endif
}
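
// Scalar model of one 64-bit lane (a sketch of the decomposition above, with
// the low halves compared as unsigned):
//
//   int gt = (a_hi > b_hi) || (a_hi == b_hi && a_lo > b_lo);
//   r = gt ? 0xFFFFFFFFFFFFFFFF : 0x0;
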
// Compares the four 32-bit floats in a and b to check if any values are NaN.
// Ordered compare between each value returns true for "orderable" and false for
// "not orderable" (NaN).
// https://msdn.microsoft.com/en-us/library/vstudio/0h9w00fx(v=vs.100).aspx
// See also:
// http://stackoverflow.com/questions/8627331/what-does-ordered-unordered-comparison-mean
// http://stackoverflow.com/questions/29349621/neon-isnanval-intrinsics
FORCE_INLINE __m128 _mm_cmpord_ps(__m128 a, __m128 b)
{
    // Note: NEON does not have ordered compare builtin
    // Need to compare a eq a and b eq b to check for NaN
    // Do AND of results to get final
    uint32x4_t ceqaa =
        vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a));
    uint32x4_t ceqbb =
        vceqq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b));
    return vreinterpretq_m128_u32(vandq_u32(ceqaa, ceqbb));
}
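
// This relies on NaN being the only value for which x == x is false, e.g.:
//
//   a = [1.0f, NAN, ...], b = [2.0f, 3.0f, ...]
//   r = [0xFFFFFFFF, 0x00000000, ...]
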
// Compares the lower single-precision floating point scalar values of a and b
// using a less than operation.
// https://msdn.microsoft.com/en-us/library/2kwe606b(v=vs.90).aspx
// Important note!! The documentation on MSDN is incorrect! If either of the
// values is a NaN the docs say you will get a one, but in fact, it will return
// a zero!!
FORCE_INLINE int _mm_comilt_ss(__m128 a, __m128 b)
{
    uint32x4_t a_not_nan =
        vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a));
    uint32x4_t b_not_nan =
        vceqq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b));
    uint32x4_t a_and_b_not_nan = vandq_u32(a_not_nan, b_not_nan);
    uint32x4_t a_lt_b =
        vcltq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b));
    return (vgetq_lane_u32(vandq_u32(a_and_b_not_nan, a_lt_b), 0) != 0) ? 1 : 0;
}

// Compares the lower single-precision floating point scalar values of a and b
// using a greater than operation.
// https://msdn.microsoft.com/en-us/library/b0738e0t(v=vs.100).aspx
FORCE_INLINE int _mm_comigt_ss(__m128 a, __m128 b)
{
    // return vgetq_lane_u32(vcgtq_f32(vreinterpretq_f32_m128(a),
    //                                 vreinterpretq_f32_m128(b)), 0);
    uint32x4_t a_not_nan =
        vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a));
    uint32x4_t b_not_nan =
        vceqq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b));
    uint32x4_t a_and_b_not_nan = vandq_u32(a_not_nan, b_not_nan);
    uint32x4_t a_gt_b =
        vcgtq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b));
    return (vgetq_lane_u32(vandq_u32(a_and_b_not_nan, a_gt_b), 0) != 0) ? 1 : 0;
}

// Compares the lower single-precision floating point scalar values of a and b
// using a less than or equal operation.
// https://msdn.microsoft.com/en-us/library/1w4t7c57(v=vs.90).aspx
FORCE_INLINE int _mm_comile_ss(__m128 a, __m128 b)
{
    // return vgetq_lane_u32(vcleq_f32(vreinterpretq_f32_m128(a),
    //                                 vreinterpretq_f32_m128(b)), 0);
    uint32x4_t a_not_nan =
        vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a));
    uint32x4_t b_not_nan =
        vceqq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b));
    uint32x4_t a_and_b_not_nan = vandq_u32(a_not_nan, b_not_nan);
    uint32x4_t a_le_b =
        vcleq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b));
    return (vgetq_lane_u32(vandq_u32(a_and_b_not_nan, a_le_b), 0) != 0) ? 1 : 0;
}

// Compares the lower single-precision floating point scalar values of a and b
// using a greater than or equal operation.
// https://msdn.microsoft.com/en-us/library/8t80des6(v=vs.100).aspx
FORCE_INLINE int _mm_comige_ss(__m128 a, __m128 b)
{
    // return vgetq_lane_u32(vcgeq_f32(vreinterpretq_f32_m128(a),
    //                                 vreinterpretq_f32_m128(b)), 0);
    uint32x4_t a_not_nan =
        vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a));
    uint32x4_t b_not_nan =
        vceqq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b));
    uint32x4_t a_and_b_not_nan = vandq_u32(a_not_nan, b_not_nan);
    uint32x4_t a_ge_b =
        vcgeq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b));
    return (vgetq_lane_u32(vandq_u32(a_and_b_not_nan, a_ge_b), 0) != 0) ? 1 : 0;
}

// Compares the lower single-precision floating point scalar values of a and b
// using an equality operation.
// https://msdn.microsoft.com/en-us/library/93yx2h2b(v=vs.100).aspx
FORCE_INLINE int _mm_comieq_ss(__m128 a, __m128 b)
{
    // return vgetq_lane_u32(vceqq_f32(vreinterpretq_f32_m128(a),
    //                                 vreinterpretq_f32_m128(b)), 0);
    uint32x4_t a_not_nan =
        vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a));
    uint32x4_t b_not_nan =
        vceqq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b));
    uint32x4_t a_and_b_not_nan = vandq_u32(a_not_nan, b_not_nan);
    uint32x4_t a_eq_b =
        vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b));
    return (vgetq_lane_u32(vandq_u32(a_and_b_not_nan, a_eq_b), 0) != 0) ? 1 : 0;
}

// Compares the lower single-precision floating point scalar values of a and b
// using an inequality operation.
// https://msdn.microsoft.com/en-us/library/bafh5e0a(v=vs.90).aspx
FORCE_INLINE int _mm_comineq_ss(__m128 a, __m128 b)
{
    // return !vgetq_lane_u32(vceqq_f32(vreinterpretq_f32_m128(a),
    //                                  vreinterpretq_f32_m128(b)), 0);
    uint32x4_t a_not_nan =
        vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a));
    uint32x4_t b_not_nan =
        vceqq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b));
    uint32x4_t a_or_b_nan = vmvnq_u32(vandq_u32(a_not_nan, b_not_nan));
    uint32x4_t a_neq_b = vmvnq_u32(
        vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
    return (vgetq_lane_u32(vorrq_u32(a_or_b_nan, a_neq_b), 0) != 0) ? 1 : 0;
}

// According to the documentation, these intrinsics behave the same as the
// non-'u' versions. We'll just alias them here.
#define _mm_ucomilt_ss _mm_comilt_ss
#define _mm_ucomile_ss _mm_comile_ss
#define _mm_ucomigt_ss _mm_comigt_ss
#define _mm_ucomige_ss _mm_comige_ss
#define _mm_ucomieq_ss _mm_comieq_ss
#define _mm_ucomineq_ss _mm_comineq_ss

// ******************************************
// Conversions
// ******************************************

// Converts the four single-precision, floating-point values of a to signed
// 32-bit integer values using truncate.
// https://msdn.microsoft.com/en-us/library/vstudio/1h005y6x(v=vs.100).aspx
FORCE_INLINE __m128i _mm_cvttps_epi32(__m128 a)
{
    return vreinterpretq_m128i_s32(vcvtq_s32_f32(vreinterpretq_f32_m128(a)));
}

// Converts the four signed 32-bit integer values of a to single-precision,
// floating-point values.
// https://msdn.microsoft.com/en-us/library/vstudio/36bwxcx5(v=vs.100).aspx
FORCE_INLINE __m128 _mm_cvtepi32_ps(__m128i a)
{
    return vreinterpretq_m128_f32(vcvtq_f32_s32(vreinterpretq_s32_m128i(a)));
}

// Converts the eight unsigned 8-bit integers in the lower 64 bits to eight
// unsigned 16-bit integers.
FORCE_INLINE __m128i _mm_cvtepu8_epi16(__m128i a)
{
    uint8x16_t u8x16 = vreinterpretq_u8_m128i(a);    /* xxxx xxxx HGFE DCBA */
    uint16x8_t u16x8 = vmovl_u8(vget_low_u8(u8x16)); /* 0H0G 0F0E 0D0C 0B0A */
    return vreinterpretq_m128i_u16(u16x8);
}

// Converts the four unsigned 8-bit integers in the lower 32 bits to four
// unsigned 32-bit integers.
// https://msdn.microsoft.com/en-us/library/bb531467%28v=vs.100%29.aspx
FORCE_INLINE __m128i _mm_cvtepu8_epi32(__m128i a)
{
    uint8x16_t u8x16 = vreinterpretq_u8_m128i(a);      /* xxxx xxxx xxxx DCBA */
    uint16x8_t u16x8 = vmovl_u8(vget_low_u8(u8x16));   /* 0x0x 0x0x 0D0C 0B0A */
    uint32x4_t u32x4 = vmovl_u16(vget_low_u16(u16x8)); /* 000D 000C 000B 000A */
    return vreinterpretq_m128i_u32(u32x4);
}

// Converts the two unsigned 8-bit integers in the lower 16 bits to two
// unsigned 64-bit integers.
FORCE_INLINE __m128i _mm_cvtepu8_epi64(__m128i a)
{
    uint8x16_t u8x16 = vreinterpretq_u8_m128i(a);      /* xxxx xxxx xxxx xxBA */
    uint16x8_t u16x8 = vmovl_u8(vget_low_u8(u8x16));   /* 0x0x 0x0x 0x0x 0B0A */
    uint32x4_t u32x4 = vmovl_u16(vget_low_u16(u16x8)); /* 000x 000x 000B 000A */
    uint64x2_t u64x2 = vmovl_u32(vget_low_u32(u32x4)); /* 0000 000B 0000 000A */
    return vreinterpretq_m128i_u64(u64x2);
}

// Converts the eight signed 8-bit integers in the lower 64 bits to eight
// signed 16-bit integers.
FORCE_INLINE __m128i _mm_cvtepi8_epi16(__m128i a)
{
    int8x16_t s8x16 = vreinterpretq_s8_m128i(a);    /* xxxx xxxx HGFE DCBA */
    int16x8_t s16x8 = vmovl_s8(vget_low_s8(s8x16)); /* 0H0G 0F0E 0D0C 0B0A */
    return vreinterpretq_m128i_s16(s16x8);
}

// Converts the four signed 8-bit integers in the lower 32 bits to four
// signed 32-bit integers.
FORCE_INLINE __m128i _mm_cvtepi8_epi32(__m128i a)
{
    int8x16_t s8x16 = vreinterpretq_s8_m128i(a);      /* xxxx xxxx xxxx DCBA */
    int16x8_t s16x8 = vmovl_s8(vget_low_s8(s8x16));   /* 0x0x 0x0x 0D0C 0B0A */
    int32x4_t s32x4 = vmovl_s16(vget_low_s16(s16x8)); /* 000D 000C 000B 000A */
    return vreinterpretq_m128i_s32(s32x4);
}

// Converts the two signed 8-bit integers in the lower 16 bits to two
// signed 64-bit integers.
FORCE_INLINE __m128i _mm_cvtepi8_epi64(__m128i a)
{
    int8x16_t s8x16 = vreinterpretq_s8_m128i(a);      /* xxxx xxxx xxxx xxBA */
    int16x8_t s16x8 = vmovl_s8(vget_low_s8(s8x16));   /* 0x0x 0x0x 0x0x 0B0A */
    int32x4_t s32x4 = vmovl_s16(vget_low_s16(s16x8)); /* 000x 000x 000B 000A */
    int64x2_t s64x2 = vmovl_s32(vget_low_s32(s32x4)); /* 0000 000B 0000 000A */
    return vreinterpretq_m128i_s64(s64x2);
}

// Converts the four signed 16-bit integers in the lower 64 bits to four signed
// 32-bit integers.
FORCE_INLINE __m128i _mm_cvtepi16_epi32(__m128i a)
{
    return vreinterpretq_m128i_s32(
        vmovl_s16(vget_low_s16(vreinterpretq_s16_m128i(a))));
}

// Converts the two signed 16-bit integers in the lower 32 bits to two signed
// 64-bit integers.
FORCE_INLINE __m128i _mm_cvtepi16_epi64(__m128i a)
{
    int16x8_t s16x8 = vreinterpretq_s16_m128i(a);     /* xxxx xxxx xxxx 0B0A */
    int32x4_t s32x4 = vmovl_s16(vget_low_s16(s16x8)); /* 000x 000x 000B 000A */
    int64x2_t s64x2 = vmovl_s32(vget_low_s32(s32x4)); /* 0000 000B 0000 000A */
    return vreinterpretq_m128i_s64(s64x2);
}

// Converts the four unsigned 16-bit integers in the lower 64 bits to four
// unsigned 32-bit integers.
FORCE_INLINE __m128i _mm_cvtepu16_epi32(__m128i a)
{
    return vreinterpretq_m128i_u32(
        vmovl_u16(vget_low_u16(vreinterpretq_u16_m128i(a))));
}

// Converts the two unsigned 16-bit integers in the lower 32 bits to two
// unsigned 64-bit integers.
FORCE_INLINE __m128i _mm_cvtepu16_epi64(__m128i a)
{
    uint16x8_t u16x8 = vreinterpretq_u16_m128i(a);     /* xxxx xxxx xxxx 0B0A */
    uint32x4_t u32x4 = vmovl_u16(vget_low_u16(u16x8)); /* 000x 000x 000B 000A */
    uint64x2_t u64x2 = vmovl_u32(vget_low_u32(u32x4)); /* 0000 000B 0000 000A */
    return vreinterpretq_m128i_u64(u64x2);
}

// Converts the two unsigned 32-bit integers in the lower 64 bits to two
// unsigned 64-bit integers.
FORCE_INLINE __m128i _mm_cvtepu32_epi64(__m128i a)
{
    return vreinterpretq_m128i_u64(
        vmovl_u32(vget_low_u32(vreinterpretq_u32_m128i(a))));
}

// Converts the two signed 32-bit integers in the lower 64 bits to two signed
// 64-bit integers.
FORCE_INLINE __m128i _mm_cvtepi32_epi64(__m128i a)
{
    return vreinterpretq_m128i_s64(
        vmovl_s32(vget_low_s32(vreinterpretq_s32_m128i(a))));
}

// Converts the four single-precision, floating-point values of a to signed
// 32-bit integer values.
//
//   r0 := (int) a0
//   r1 := (int) a1
//   r2 := (int) a2
//   r3 := (int) a3
//
// https://msdn.microsoft.com/en-us/library/vstudio/xdc42k5e(v=vs.100).aspx
// *NOTE*. The default rounding mode on SSE is 'round to even', which ARMv7-A
// does not support! It is supported on ARMv8-A however.
FORCE_INLINE __m128i _mm_cvtps_epi32(__m128 a)
{
#if defined(__aarch64__)
    return vreinterpretq_m128i_s32(vcvtnq_s32_f32(a));
#else
    uint32x4_t signmask = vdupq_n_u32(0x80000000);
    float32x4_t half = vbslq_f32(signmask, vreinterpretq_f32_m128(a),
                                 vdupq_n_f32(0.5f)); /* +/- 0.5 */
    int32x4_t r_normal = vcvtq_s32_f32(vaddq_f32(
        vreinterpretq_f32_m128(a), half)); /* round to integer: [a + 0.5] */
    int32x4_t r_trunc =
        vcvtq_s32_f32(vreinterpretq_f32_m128(a)); /* truncate to integer: [a] */
    int32x4_t plusone = vreinterpretq_s32_u32(vshrq_n_u32(
        vreinterpretq_u32_s32(vnegq_s32(r_trunc)), 31)); /* 1 or 0 */
    int32x4_t r_even = vbicq_s32(vaddq_s32(r_trunc, plusone),
                                 vdupq_n_s32(1)); /* ([a] + {0,1}) & ~1 */
    float32x4_t delta = vsubq_f32(
        vreinterpretq_f32_m128(a),
        vcvtq_f32_s32(r_trunc)); /* compute delta: delta = (a - [a]) */
    uint32x4_t is_delta_half = vceqq_f32(delta, half); /* delta == +/- 0.5 */
    return vreinterpretq_m128i_s32(vbslq_s32(is_delta_half, r_even, r_normal));
#endif
}
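
// Scalar model of the ARMv7 fallback for one lane (a sketch, not the exact
// instruction sequence):
//
//   float half = (x < 0) ? -0.5f : 0.5f;
//   int r_normal = (int) (x + half);              // round half away from zero
//   int r_trunc = (int) x;                        // truncate towards zero
//   int r_even = (r_trunc + (r_trunc > 0)) & ~1;  // nearest even neighbor
//   int r = (x - (float) r_trunc == half) ? r_even : r_normal;
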
// Moves the least significant 32 bits of a to a 32-bit integer.
// https://msdn.microsoft.com/en-us/library/5z7a9642%28v=vs.90%29.aspx
FORCE_INLINE int _mm_cvtsi128_si32(__m128i a)
{
    return vgetq_lane_s32(vreinterpretq_s32_m128i(a), 0);
}

// Extracts the low order 64-bit integer from the parameter.
// https://msdn.microsoft.com/en-us/library/bb531384(v=vs.120).aspx
FORCE_INLINE int64_t _mm_cvtsi128_si64(__m128i a)
{
    return vgetq_lane_s64(vreinterpretq_s64_m128i(a), 0);
}

// Moves 32-bit integer a to the least significant 32 bits of an __m128 object,
// zero extending the upper bits.
//
//   r0 := a
//   r1 := 0x0
//   r2 := 0x0
//   r3 := 0x0
//
// https://msdn.microsoft.com/en-us/library/ct3539ha%28v=vs.90%29.aspx
FORCE_INLINE __m128i _mm_cvtsi32_si128(int a)
{
    return vreinterpretq_m128i_s32(vsetq_lane_s32(a, vdupq_n_s32(0), 0));
}

// Moves 64-bit integer a to the least significant 64 bits of an __m128 object,
// zero extending the upper bits.
//
//   r0 := a
//   r1 := 0x0
FORCE_INLINE __m128i _mm_cvtsi64_si128(int64_t a)
{
    return vreinterpretq_m128i_s64(vsetq_lane_s64(a, vdupq_n_s64(0), 0));
}

// Applies a type cast to reinterpret four 32-bit floating point values passed
// in as a 128-bit parameter as packed 32-bit integers.
// https://msdn.microsoft.com/en-us/library/bb514099.aspx
FORCE_INLINE __m128i _mm_castps_si128(__m128 a)
{
    return vreinterpretq_m128i_s32(vreinterpretq_s32_m128(a));
}

// Applies a type cast to reinterpret four 32-bit integers passed in as a
// 128-bit parameter as packed 32-bit floating point values.
// https://msdn.microsoft.com/en-us/library/bb514029.aspx
FORCE_INLINE __m128 _mm_castsi128_ps(__m128i a)
{
    return vreinterpretq_m128_s32(vreinterpretq_s32_m128i(a));
}

// Loads 128-bit value.
// https://msdn.microsoft.com/en-us/library/atzzad1h(v=vs.80).aspx
FORCE_INLINE __m128i _mm_load_si128(const __m128i *p)
{
    return vreinterpretq_m128i_s32(vld1q_s32((const int32_t *) p));
}

// Loads 128-bit value.
// https://msdn.microsoft.com/zh-cn/library/f4k12ae8(v=vs.90).aspx
FORCE_INLINE __m128i _mm_loadu_si128(const __m128i *p)
{
    return vreinterpretq_m128i_s32(vld1q_s32((const int32_t *) p));
}

// _mm_lddqu_si128 functions the same as _mm_loadu_si128.
#define _mm_lddqu_si128 _mm_loadu_si128

// ******************************************
// Miscellaneous Operations
// ******************************************

// Shifts the 8 signed 16-bit integers in a right by count bits while shifting
// in the sign bit.
//
//   r0 := a0 >> count
//   r1 := a1 >> count
//   ...
//   r7 := a7 >> count
//
// https://msdn.microsoft.com/en-us/library/3c9997dk(v%3dvs.90).aspx
FORCE_INLINE __m128i _mm_sra_epi16(__m128i a, __m128i count)
{
    int64_t c = (int64_t) vget_low_s64((int64x2_t) count);
    if (c > 15)
        return _mm_cmplt_epi16(a, _mm_setzero_si128());
    return vreinterpretq_m128i_s16(vshlq_s16((int16x8_t) a, vdupq_n_s16(-c)));
}

// Shifts the 4 signed 32-bit integers in a right by count bits while shifting
// in the sign bit.
//
//   r0 := a0 >> count
//   r1 := a1 >> count
//   r2 := a2 >> count
//   r3 := a3 >> count
//
// https://msdn.microsoft.com/en-us/library/ce40009e(v%3dvs.100).aspx
FORCE_INLINE __m128i _mm_sra_epi32(__m128i a, __m128i count)
{
    int64_t c = (int64_t) vget_low_s64((int64x2_t) count);
    if (c > 31)
        return _mm_cmplt_epi32(a, _mm_setzero_si128());
    return vreinterpretq_m128i_s32(vshlq_s32((int32x4_t) a, vdupq_n_s32(-c)));
}

// Packs the 16 signed 16-bit integers from a and b into 8-bit integers and
// saturates.
// https://msdn.microsoft.com/en-us/library/k4y4f7w5%28v=vs.90%29.aspx
FORCE_INLINE __m128i _mm_packs_epi16(__m128i a, __m128i b)
{
    return vreinterpretq_m128i_s8(
        vcombine_s8(vqmovn_s16(vreinterpretq_s16_m128i(a)),
                    vqmovn_s16(vreinterpretq_s16_m128i(b))));
}

// Packs the 16 signed 16-bit integers from a and b into 8-bit unsigned
// integers and saturates.
//
//   r0 := UnsignedSaturate(a0)
//   r1 := UnsignedSaturate(a1)
//   ...
//   r7 := UnsignedSaturate(a7)
//   r8 := UnsignedSaturate(b0)
//   r9 := UnsignedSaturate(b1)
//   ...
//   r15 := UnsignedSaturate(b7)
//
// https://msdn.microsoft.com/en-us/library/07ad1wx4(v=vs.100).aspx
FORCE_INLINE __m128i _mm_packus_epi16(const __m128i a, const __m128i b)
{
    return vreinterpretq_m128i_u8(
        vcombine_u8(vqmovun_s16(vreinterpretq_s16_m128i(a)),
                    vqmovun_s16(vreinterpretq_s16_m128i(b))));
}

// Packs the 8 signed 32-bit integers from a and b into signed 16-bit integers
// and saturates.
//
//   r0 := SignedSaturate(a0)
//   r1 := SignedSaturate(a1)
//   r2 := SignedSaturate(a2)
//   r3 := SignedSaturate(a3)
//   r4 := SignedSaturate(b0)
//   r5 := SignedSaturate(b1)
//   r6 := SignedSaturate(b2)
//   r7 := SignedSaturate(b3)
//
// https://msdn.microsoft.com/en-us/library/393t56f9%28v=vs.90%29.aspx
FORCE_INLINE __m128i _mm_packs_epi32(__m128i a, __m128i b)
{
    return vreinterpretq_m128i_s16(
        vcombine_s16(vqmovn_s32(vreinterpretq_s32_m128i(a)),
                     vqmovn_s32(vreinterpretq_s32_m128i(b))));
}

// Packs the 8 signed 32-bit integers from a and b into unsigned 16-bit
// integers and saturates.
//
//   r0 := UnsignedSaturate(a0)
//   r1 := UnsignedSaturate(a1)
//   r2 := UnsignedSaturate(a2)
//   r3 := UnsignedSaturate(a3)
//   r4 := UnsignedSaturate(b0)
//   r5 := UnsignedSaturate(b1)
//   r6 := UnsignedSaturate(b2)
//   r7 := UnsignedSaturate(b3)
FORCE_INLINE __m128i _mm_packus_epi32(__m128i a, __m128i b)
{
    return vreinterpretq_m128i_u16(
        vcombine_u16(vqmovun_s32(vreinterpretq_s32_m128i(a)),
                     vqmovun_s32(vreinterpretq_s32_m128i(b))));
}

// Interleaves the lower 8 signed or unsigned 8-bit integers in a with the lower
// 8 signed or unsigned 8-bit integers in b.
//
//   r0 := a0
//   r1 := b0
//   r2 := a1
//   r3 := b1
//   ...
//   r14 := a7
//   r15 := b7
//
// https://msdn.microsoft.com/en-us/library/xf7k860c%28v=vs.90%29.aspx
FORCE_INLINE __m128i _mm_unpacklo_epi8(__m128i a, __m128i b)
{
#if defined(__aarch64__)
    return vreinterpretq_m128i_s8(
        vzip1q_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
#else
    int8x8_t a1 = vreinterpret_s8_s16(vget_low_s16(vreinterpretq_s16_m128i(a)));
    int8x8_t b1 = vreinterpret_s8_s16(vget_low_s16(vreinterpretq_s16_m128i(b)));
    int8x8x2_t result = vzip_s8(a1, b1);
    return vreinterpretq_m128i_s8(vcombine_s8(result.val[0], result.val[1]));
#endif
}

// Interleaves the lower 4 signed or unsigned 16-bit integers in a with the
// lower 4 signed or unsigned 16-bit integers in b.
//
//   r0 := a0
//   r1 := b0
//   r2 := a1
//   r3 := b1
//   r4 := a2
//   r5 := b2
//   r6 := a3
//   r7 := b3
//
// https://msdn.microsoft.com/en-us/library/btxb17bw%28v=vs.90%29.aspx
FORCE_INLINE __m128i _mm_unpacklo_epi16(__m128i a, __m128i b)
{
#if defined(__aarch64__)
    return vreinterpretq_m128i_s16(
        vzip1q_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
#else
    int16x4_t a1 = vget_low_s16(vreinterpretq_s16_m128i(a));
    int16x4_t b1 = vget_low_s16(vreinterpretq_s16_m128i(b));
    int16x4x2_t result = vzip_s16(a1, b1);
    return vreinterpretq_m128i_s16(vcombine_s16(result.val[0], result.val[1]));
#endif
}

// Interleaves the lower 2 signed or unsigned 32-bit integers in a with the
// lower 2 signed or unsigned 32-bit integers in b.
//
//   r0 := a0
//   r1 := b0
//   r2 := a1
//   r3 := b1
//
// https://msdn.microsoft.com/en-us/library/x8atst9d(v=vs.100).aspx
FORCE_INLINE __m128i _mm_unpacklo_epi32(__m128i a, __m128i b)
{
#if defined(__aarch64__)
    return vreinterpretq_m128i_s32(
        vzip1q_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
#else
    int32x2_t a1 = vget_low_s32(vreinterpretq_s32_m128i(a));
    int32x2_t b1 = vget_low_s32(vreinterpretq_s32_m128i(b));
    int32x2x2_t result = vzip_s32(a1, b1);
    return vreinterpretq_m128i_s32(vcombine_s32(result.val[0], result.val[1]));
#endif
}

// Interleaves the lower signed or unsigned 64-bit integer in a with the
// lower signed or unsigned 64-bit integer in b.
FORCE_INLINE __m128i _mm_unpacklo_epi64(__m128i a, __m128i b)
{
    int64x1_t a_l = vget_low_s64(vreinterpretq_s64_m128i(a));
    int64x1_t b_l = vget_low_s64(vreinterpretq_s64_m128i(b));
    return vreinterpretq_m128i_s64(vcombine_s64(a_l, b_l));
}

// Selects and interleaves the lower two single-precision, floating-point values
// from a and b.
//
//   r0 := a0
//   r1 := b0
//   r2 := a1
//   r3 := b1
//
// https://msdn.microsoft.com/en-us/library/25st103b%28v=vs.90%29.aspx
FORCE_INLINE __m128 _mm_unpacklo_ps(__m128 a, __m128 b)
{
#if defined(__aarch64__)
    return vreinterpretq_m128_f32(
        vzip1q_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
#else
    float32x2_t a1 = vget_low_f32(vreinterpretq_f32_m128(a));
    float32x2_t b1 = vget_low_f32(vreinterpretq_f32_m128(b));
    float32x2x2_t result = vzip_f32(a1, b1);
    return vreinterpretq_m128_f32(vcombine_f32(result.val[0], result.val[1]));
#endif
}

// Selects and interleaves the upper two single-precision, floating-point values
// from a and b.
//
//   r0 := a2
//   r1 := b2
//   r2 := a3
//   r3 := b3
//
// https://msdn.microsoft.com/en-us/library/skccxx7d%28v=vs.90%29.aspx
FORCE_INLINE __m128 _mm_unpackhi_ps(__m128 a, __m128 b)
{
#if defined(__aarch64__)
    return vreinterpretq_m128_f32(
        vzip2q_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
#else
    float32x2_t a1 = vget_high_f32(vreinterpretq_f32_m128(a));
    float32x2_t b1 = vget_high_f32(vreinterpretq_f32_m128(b));
    float32x2x2_t result = vzip_f32(a1, b1);
    return vreinterpretq_m128_f32(vcombine_f32(result.val[0], result.val[1]));
#endif
}

// Interleaves the upper 8 signed or unsigned 8-bit integers in a with the upper
// 8 signed or unsigned 8-bit integers in b.
//
//   r0 := a8
//   r1 := b8
//   r2 := a9
//   r3 := b9
//   ...
//   r14 := a15
//   r15 := b15
//
// https://msdn.microsoft.com/en-us/library/t5h7783k(v=vs.100).aspx
FORCE_INLINE __m128i _mm_unpackhi_epi8(__m128i a, __m128i b)
{
#if defined(__aarch64__)
    return vreinterpretq_m128i_s8(
        vzip2q_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
#else
    int8x8_t a1 =
        vreinterpret_s8_s16(vget_high_s16(vreinterpretq_s16_m128i(a)));
    int8x8_t b1 =
        vreinterpret_s8_s16(vget_high_s16(vreinterpretq_s16_m128i(b)));
    int8x8x2_t result = vzip_s8(a1, b1);
    return vreinterpretq_m128i_s8(vcombine_s8(result.val[0], result.val[1]));
#endif
}

// Interleaves the upper 4 signed or unsigned 16-bit integers in a with the
// upper 4 signed or unsigned 16-bit integers in b.
//
//   r0 := a4
//   r1 := b4
//   r2 := a5
//   r3 := b5
//   r4 := a6
//   r5 := b6
//   r6 := a7
//   r7 := b7
//
// https://msdn.microsoft.com/en-us/library/03196cz7(v=vs.100).aspx
FORCE_INLINE __m128i _mm_unpackhi_epi16(__m128i a, __m128i b)
{
#if defined(__aarch64__)
    return vreinterpretq_m128i_s16(
        vzip2q_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
#else
    int16x4_t a1 = vget_high_s16(vreinterpretq_s16_m128i(a));
    int16x4_t b1 = vget_high_s16(vreinterpretq_s16_m128i(b));
    int16x4x2_t result = vzip_s16(a1, b1);
    return vreinterpretq_m128i_s16(vcombine_s16(result.val[0], result.val[1]));
#endif
}

// Interleaves the upper 2 signed or unsigned 32-bit integers in a with the
// upper 2 signed or unsigned 32-bit integers in b.
// https://msdn.microsoft.com/en-us/library/65sa7cbs(v=vs.100).aspx
FORCE_INLINE __m128i _mm_unpackhi_epi32(__m128i a, __m128i b)
{
#if defined(__aarch64__)
    return vreinterpretq_m128i_s32(
        vzip2q_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
#else
    int32x2_t a1 = vget_high_s32(vreinterpretq_s32_m128i(a));
    int32x2_t b1 = vget_high_s32(vreinterpretq_s32_m128i(b));
    int32x2x2_t result = vzip_s32(a1, b1);
    return vreinterpretq_m128i_s32(vcombine_s32(result.val[0], result.val[1]));
#endif
}

// Interleaves the upper signed or unsigned 64-bit integer in a with the
// upper signed or unsigned 64-bit integer in b.
//
//   r0 := a1
//   r1 := b1
FORCE_INLINE __m128i _mm_unpackhi_epi64(__m128i a, __m128i b)
{
    int64x1_t a_h = vget_high_s64(vreinterpretq_s64_m128i(a));
    int64x1_t b_h = vget_high_s64(vreinterpretq_s64_m128i(b));
    return vreinterpretq_m128i_s64(vcombine_s64(a_h, b_h));
}

// Concatenates the 16-byte values in a and b, shifts the 32-byte composite
// right by c bytes, and returns the low 16 bytes (PALIGNR).
// https://msdn.microsoft.com/en-us/library/bb514041(v=vs.120).aspx
// http://blog.csdn.net/hemmingway/article/details/44828303
// Clang requires a macro here, as it is extremely picky about c being a
// literal.
#define _mm_alignr_epi8(a, b, c) \
    ((__m128i) vextq_s8((int8x16_t) (b), (int8x16_t) (a), (c)))
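
// For example, with byte lanes written high to low (values illustrative):
//
//   a = [a15 .. a0], b = [b15 .. b0]
//   _mm_alignr_epi8(a, b, 4) = [a3 a2 a1 a0 b15 b14 .. b4]
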
3368 // Extracts the selected signed or unsigned 8-bit integer from a and zero
3370 // FORCE_INLINE int _mm_extract_epi8(__m128i a, __constrange(0,16) int imm)
3371 #define _mm_extract_epi8(a, imm) \
3372 vgetq_lane_u8(vreinterpretq_u8_m128i(a), (imm))
3374 // Inserts the least significant 8 bits of b into the selected 8-bit integer
3376 // FORCE_INLINE __m128i _mm_insert_epi8(__m128i a, const int b,
3377 // __constrange(0,16) int imm)
3378 #define _mm_insert_epi8(a, b, imm) \
3380 vreinterpretq_m128i_s8( \
3381 vsetq_lane_s8((b), vreinterpretq_s8_m128i(a), (imm))); \
3384 // Extracts the selected signed or unsigned 16-bit integer from a and zero
3386 // https://msdn.microsoft.com/en-us/library/6dceta0c(v=vs.100).aspx
3387 // FORCE_INLINE int _mm_extract_epi16(__m128i a, __constrange(0,8) int imm)
3388 #define _mm_extract_epi16(a, imm) \
3389 vgetq_lane_u16(vreinterpretq_u16_m128i(a), (imm))
3391 // Inserts the least significant 16 bits of b into the selected 16-bit integer
3393 // https://msdn.microsoft.com/en-us/library/kaze8hz1%28v=vs.100%29.aspx
3394 // FORCE_INLINE __m128i _mm_insert_epi16(__m128i a, const int b,
3395 // __constrange(0,8) int imm)
3396 #define _mm_insert_epi16(a, b, imm) \
3398 vreinterpretq_m128i_s16( \
3399 vsetq_lane_s16((b), vreinterpretq_s16_m128i(a), (imm))); \
3402 // Extracts the selected signed or unsigned 32-bit integer from a and zero
3404 // FORCE_INLINE int _mm_extract_epi32(__m128i a, __constrange(0,4) int imm)
3405 #define _mm_extract_epi32(a, imm) \
3406 vgetq_lane_s32(vreinterpretq_s32_m128i(a), (imm))
3408 // Inserts the least significant 32 bits of b into the selected 32-bit integer
3410 // FORCE_INLINE __m128i _mm_insert_epi32(__m128i a, const int b,
3411 // __constrange(0,4) int imm)
3412 #define _mm_insert_epi32(a, b, imm) \
3414 vreinterpretq_m128i_s32( \
3415 vsetq_lane_s32((b), vreinterpretq_s32_m128i(a), (imm))); \
// Extracts the selected signed or unsigned 64-bit integer from a and zero
// extends.
// FORCE_INLINE __int64 _mm_extract_epi64(__m128i a, __constrange(0,2) int imm)
#define _mm_extract_epi64(a, imm) \
    vgetq_lane_s64(vreinterpretq_s64_m128i(a), (imm))
// Inserts the least significant 64 bits of b into the selected 64-bit integer
// of a.
// FORCE_INLINE __m128i _mm_insert_epi64(__m128i a, const __int64 b,
//                                      __constrange(0,2) int imm)
#define _mm_insert_epi64(a, b, imm)                                  \
    __extension__({                                                  \
        vreinterpretq_m128i_s64(                                     \
            vsetq_lane_s64((b), vreinterpretq_s64_m128i(a), (imm))); \
    })
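
// Illustrative example (added for clarity, not part of the original header):
// reading and replacing a single lane; imm must be a compile-time constant in
// the stated range:
//   int v = _mm_extract_epi16(a, 3);        // read 16-bit lane 3 of a
//   __m128i r = _mm_insert_epi16(a, 42, 3); // a with 16-bit lane 3 set to 42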
// ******************************************
// Crypto Extensions
// ******************************************

#if defined(__ARM_FEATURE_CRYPTO)
// Wraps vmull_p64
FORCE_INLINE uint64x2_t _sse2neon_vmull_p64(uint64x1_t _a, uint64x1_t _b)
{
    poly64_t a = vget_lane_p64(vreinterpret_p64_u64(_a), 0);
    poly64_t b = vget_lane_p64(vreinterpret_p64_u64(_b), 0);
    return vreinterpretq_u64_p128(vmull_p64(a, b));
}
#else  // ARMv7 polyfill
// ARMv7/some A64 lacks vmull_p64, but it has vmull_p8.
//
// vmull_p8 calculates 8 8-bit->16-bit polynomial multiplies, but we need a
// 64-bit->128-bit polynomial multiply.
//
// It needs some work and is somewhat slow, but it is still faster than all
// known scalar methods.
//
// Algorithm adapted to C from
// https://www.workofard.com/2017/07/ghash-for-low-end-cores/, which is adapted
// from "Fast Software Polynomial Multiplication on ARM Processors Using the
// NEON Engine" by Danilo Camara, Conrado Gouvea, Julio Lopez and Ricardo Dahab
// (https://hal.inria.fr/hal-01506572)
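//
// Sketch of the decomposition used below (added for clarity): writing the
// operands as bytes a = a0..a7 and b = b0..b7, the 128-bit carry-less product
// is the XOR of all byte-level partial products a_i * b_j, each shifted left
// by 8*(i+j) bits. vmull_p8 on vext-rotated copies of a and b computes eight
// such partial products at a time; the masking, zipping, and XOR steps below
// align those partial products and accumulate them into the final result.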
static uint64x2_t _sse2neon_vmull_p64(uint64x1_t _a, uint64x1_t _b)
{
    poly8x8_t a = vreinterpret_p8_u64(_a);
    poly8x8_t b = vreinterpret_p8_u64(_b);

    // Masks
    uint8x16_t k48_32 = vcombine_u8(vcreate_u8(0x0000ffffffffffff),
                                    vcreate_u8(0x00000000ffffffff));
    uint8x16_t k16_00 = vcombine_u8(vcreate_u8(0x000000000000ffff),
                                    vcreate_u8(0x0000000000000000));
    // Do the multiplies, rotating with vext to get all combinations
    uint8x16_t d = vreinterpretq_u8_p16(vmull_p8(a, b));                 // D = A0 * B0
    uint8x16_t e = vreinterpretq_u8_p16(vmull_p8(a, vext_p8(b, b, 1)));  // E = A0 * B1
    uint8x16_t f = vreinterpretq_u8_p16(vmull_p8(vext_p8(a, a, 1), b));  // F = A1 * B0
    uint8x16_t g = vreinterpretq_u8_p16(vmull_p8(a, vext_p8(b, b, 2)));  // G = A0 * B2
    uint8x16_t h = vreinterpretq_u8_p16(vmull_p8(vext_p8(a, a, 2), b));  // H = A2 * B0
    uint8x16_t i = vreinterpretq_u8_p16(vmull_p8(a, vext_p8(b, b, 3)));  // I = A0 * B3
    uint8x16_t j = vreinterpretq_u8_p16(vmull_p8(vext_p8(a, a, 3), b));  // J = A3 * B0
    uint8x16_t k = vreinterpretq_u8_p16(vmull_p8(a, vext_p8(b, b, 4)));  // K = A0 * B4
    // Add cross products
    uint8x16_t l = veorq_u8(e, f);  // L = E + F
    uint8x16_t m = veorq_u8(g, h);  // M = G + H
    uint8x16_t n = veorq_u8(i, j);  // N = I + J
    // Interleave. Using vzip1 and vzip2 prevents Clang from emitting TBL
    // instructions.
#if defined(__aarch64__)
    uint8x16_t lm_p0 = vreinterpretq_u8_u64(
        vzip1q_u64(vreinterpretq_u64_u8(l), vreinterpretq_u64_u8(m)));
    uint8x16_t lm_p1 = vreinterpretq_u8_u64(
        vzip2q_u64(vreinterpretq_u64_u8(l), vreinterpretq_u64_u8(m)));
    uint8x16_t nk_p0 = vreinterpretq_u8_u64(
        vzip1q_u64(vreinterpretq_u64_u8(n), vreinterpretq_u64_u8(k)));
    uint8x16_t nk_p1 = vreinterpretq_u8_u64(
        vzip2q_u64(vreinterpretq_u64_u8(n), vreinterpretq_u64_u8(k)));
#else
    uint8x16_t lm_p0 = vcombine_u8(vget_low_u8(l), vget_low_u8(m));
    uint8x16_t lm_p1 = vcombine_u8(vget_high_u8(l), vget_high_u8(m));
    uint8x16_t nk_p0 = vcombine_u8(vget_low_u8(n), vget_low_u8(k));
    uint8x16_t nk_p1 = vcombine_u8(vget_high_u8(n), vget_high_u8(k));
#endif
    // t0 = (L) (P0 + P1) << 8
    // t1 = (M) (P2 + P3) << 16
    uint8x16_t t0t1_tmp = veorq_u8(lm_p0, lm_p1);
    uint8x16_t t0t1_h = vandq_u8(lm_p1, k48_32);
    uint8x16_t t0t1_l = veorq_u8(t0t1_tmp, t0t1_h);

    // t2 = (N) (P4 + P5) << 24
    // t3 = (K) (P6 + P7) << 32
    uint8x16_t t2t3_tmp = veorq_u8(nk_p0, nk_p1);
    uint8x16_t t2t3_h = vandq_u8(nk_p1, k16_00);
    uint8x16_t t2t3_l = veorq_u8(t2t3_tmp, t2t3_h);
    // De-interleave
#if defined(__aarch64__)
    uint8x16_t t0 = vreinterpretq_u8_u64(
        vuzp1q_u64(vreinterpretq_u64_u8(t0t1_l), vreinterpretq_u64_u8(t0t1_h)));
    uint8x16_t t1 = vreinterpretq_u8_u64(
        vuzp2q_u64(vreinterpretq_u64_u8(t0t1_l), vreinterpretq_u64_u8(t0t1_h)));
    uint8x16_t t2 = vreinterpretq_u8_u64(
        vuzp1q_u64(vreinterpretq_u64_u8(t2t3_l), vreinterpretq_u64_u8(t2t3_h)));
    uint8x16_t t3 = vreinterpretq_u8_u64(
        vuzp2q_u64(vreinterpretq_u64_u8(t2t3_l), vreinterpretq_u64_u8(t2t3_h)));
#else
    uint8x16_t t1 = vcombine_u8(vget_high_u8(t0t1_l), vget_high_u8(t0t1_h));
    uint8x16_t t0 = vcombine_u8(vget_low_u8(t0t1_l), vget_low_u8(t0t1_h));
    uint8x16_t t3 = vcombine_u8(vget_high_u8(t2t3_l), vget_high_u8(t2t3_h));
    uint8x16_t t2 = vcombine_u8(vget_low_u8(t2t3_l), vget_low_u8(t2t3_h));
#endif
    // Shift the cross products
    uint8x16_t t0_shift = vextq_u8(t0, t0, 15);  // t0 << 8
    uint8x16_t t1_shift = vextq_u8(t1, t1, 14);  // t1 << 16
    uint8x16_t t2_shift = vextq_u8(t2, t2, 13);  // t2 << 24
    uint8x16_t t3_shift = vextq_u8(t3, t3, 12);  // t3 << 32

    // Accumulate the products
    uint8x16_t cross1 = veorq_u8(t0_shift, t1_shift);
    uint8x16_t cross2 = veorq_u8(t2_shift, t3_shift);
    uint8x16_t mix = veorq_u8(d, cross1);
    uint8x16_t r = veorq_u8(mix, cross2);
    return vreinterpretq_u64_u8(r);
}
#endif  // ARMv7 polyfill

FORCE_INLINE __m128i _mm_clmulepi64_si128(__m128i _a, __m128i _b, const int imm)
{
    uint64x2_t a = vreinterpretq_u64_m128i(_a);
    uint64x2_t b = vreinterpretq_u64_m128i(_b);
    switch (imm & 0x11) {
    case 0x00:  // low halves of both operands
        return vreinterpretq_m128i_u64(
            _sse2neon_vmull_p64(vget_low_u64(a), vget_low_u64(b)));
    case 0x01:  // high half of a, low half of b
        return vreinterpretq_m128i_u64(
            _sse2neon_vmull_p64(vget_high_u64(a), vget_low_u64(b)));
    case 0x10:  // low half of a, high half of b
        return vreinterpretq_m128i_u64(
            _sse2neon_vmull_p64(vget_low_u64(a), vget_high_u64(b)));
    case 0x11:  // high halves of both operands
        return vreinterpretq_m128i_u64(
            _sse2neon_vmull_p64(vget_high_u64(a), vget_high_u64(b)));
    default:
        abort();
    }
}
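
// Illustrative example (added for clarity, not part of the original header):
// GHASH-style use of the carry-less multiply, selecting the high 64-bit half
// of each operand via the immediate:
//   __m128i r = _mm_clmulepi64_si128(a, b, 0x11); // clmul of a[1] and b[1]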
#if !defined(__ARM_FEATURE_CRYPTO) && defined(__aarch64__)
// In the absence of crypto extensions, implement aesenc using regular NEON
// intrinsics instead. See:
// https://www.workofard.com/2017/01/accelerated-aes-for-the-arm64-linux-kernel/
// https://www.workofard.com/2017/07/ghash-for-low-end-cores/ and
// https://github.com/ColinIanKing/linux-next-mirror/blob/b5f466091e130caaf0735976648f72bd5e09aa84/crypto/aegis128-neon-inner.c#L52
// for more information.
// Reproduced with permission of the author.
FORCE_INLINE __m128i _mm_aesenc_si128(__m128i EncBlock, __m128i RoundKey)
{
    static const uint8_t crypto_aes_sbox[256] = {
        0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5, 0x30, 0x01, 0x67, 0x2b,
        0xfe, 0xd7, 0xab, 0x76, 0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0,
        0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0, 0xb7, 0xfd, 0x93, 0x26,
        0x36, 0x3f, 0xf7, 0xcc, 0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15,
        0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a, 0x07, 0x12, 0x80, 0xe2,
        0xeb, 0x27, 0xb2, 0x75, 0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0,
        0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84, 0x53, 0xd1, 0x00, 0xed,
        0x20, 0xfc, 0xb1, 0x5b, 0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf,
        0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85, 0x45, 0xf9, 0x02, 0x7f,
        0x50, 0x3c, 0x9f, 0xa8, 0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5,
        0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2, 0xcd, 0x0c, 0x13, 0xec,
        0x5f, 0x97, 0x44, 0x17, 0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73,
        0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88, 0x46, 0xee, 0xb8, 0x14,
        0xde, 0x5e, 0x0b, 0xdb, 0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c,
        0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79, 0xe7, 0xc8, 0x37, 0x6d,
        0x8d, 0xd5, 0x4e, 0xa9, 0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08,
        0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6, 0xe8, 0xdd, 0x74, 0x1f,
        0x4b, 0xbd, 0x8b, 0x8a, 0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e,
        0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e, 0xe1, 0xf8, 0x98, 0x11,
        0x69, 0xd9, 0x8e, 0x94, 0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf,
        0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68, 0x41, 0x99, 0x2d, 0x0f,
        0xb0, 0x54, 0xbb, 0x16};
    static const uint8_t shift_rows[] = {0x0, 0x5, 0xa, 0xf, 0x4, 0x9,
                                         0xe, 0x3, 0x8, 0xd, 0x2, 0x7,
                                         0xc, 0x1, 0x6, 0xb};
    static const uint8_t ror32by8[] = {0x1, 0x2, 0x3, 0x0, 0x5, 0x6, 0x7, 0x4,
                                       0x9, 0xa, 0xb, 0x8, 0xd, 0xe, 0xf, 0xc};
    uint8x16_t v;
    uint8x16_t w = vreinterpretq_u8_m128i(EncBlock);

    // shift rows
    w = vqtbl1q_u8(w, vld1q_u8(shift_rows));

    // sub bytes
    v = vqtbl4q_u8(vld1q_u8_x4(crypto_aes_sbox), w);
    v = vqtbx4q_u8(v, vld1q_u8_x4(crypto_aes_sbox + 0x40), w - 0x40);
    v = vqtbx4q_u8(v, vld1q_u8_x4(crypto_aes_sbox + 0x80), w - 0x80);
    v = vqtbx4q_u8(v, vld1q_u8_x4(crypto_aes_sbox + 0xc0), w - 0xc0);

    // mix columns
    w = (v << 1) ^ (uint8x16_t)(((int8x16_t) v >> 7) & 0x1b);
    w ^= (uint8x16_t) vrev32q_u16((uint16x8_t) v);
    w ^= vqtbl1q_u8(v ^ w, vld1q_u8(ror32by8));

    // add round key
    return vreinterpretq_m128i_u8(w) ^ RoundKey;
}
#elif defined(__ARM_FEATURE_CRYPTO)
// Implements the equivalent of 'aesenc' by combining AESE (with an empty key)
// and AESMC, then manually applying the real round key as an xor operation.
// This unfortunately means an additional xor op; the compiler should be able
// to optimise this away for repeated calls, however. See
// https://blog.michaelbrase.com/2018/05/08/emulating-x86-aes-intrinsics-on-armv8-a
// for more details.
FORCE_INLINE __m128i _mm_aesenc_si128(__m128i a, __m128i b)
{
    return vreinterpretq_m128i_u8(
        vaesmcq_u8(vaeseq_u8(vreinterpretq_u8_m128i(a), vdupq_n_u8(0))) ^
        vreinterpretq_u8_m128i(b));
}
#endif
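
// Note (added for clarity): AESE performs AddRoundKey, SubBytes and ShiftRows
// in one instruction, so feeding it an all-zero key and xoring the real key
// after AESMC reproduces the x86 round order (ShiftRows, SubBytes, MixColumns,
// AddRoundKey):
//   __m128i state = _mm_aesenc_si128(state, round_key); // one AES round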
// ******************************************
// Streaming Extensions
// ******************************************

// Guarantees that every preceding store is globally visible before any
// subsequent store.
// https://msdn.microsoft.com/en-us/library/5h2w73d1%28v=vs.90%29.aspx
FORCE_INLINE void _mm_sfence(void)
{
    __sync_synchronize();
}
// Stores the data in a to the address p without polluting the caches. If the
// cache line containing address p is already in the cache, the cache will be
// updated. Address p must be 16-byte aligned.
// https://msdn.microsoft.com/en-us/library/ba08y07y%28v=vs.90%29.aspx
FORCE_INLINE void _mm_stream_si128(__m128i *p, __m128i a)
{
    vst1q_s64((int64_t *) p, vreinterpretq_s64_m128i(a));
}
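
// Illustrative example (added for clarity, not part of the original header):
// pairing the streaming store with a fence before handing the buffer to
// another observer:
//   _mm_stream_si128((__m128i *) dst, value);
//   _mm_sfence();  // make the store globally visible
// On NEON this is a plain store, so the non-temporal hint is advisory only.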
// Cache line containing p is flushed and invalidated from all caches in the
// coherency domain.
// https://msdn.microsoft.com/en-us/library/ba08y07y(v=vs.100).aspx
FORCE_INLINE void _mm_clflush(void const *p)
{
    (void) p;
    // no corollary for NEON; this is intentionally a no-op
}
// Allocate aligned blocks of memory.
// https://software.intel.com/en-us/
// cpp-compiler-developer-guide-and-reference-allocating-and-freeing-aligned-memory-blocks
FORCE_INLINE void *_mm_malloc(size_t size, size_t align)
{
    void *ptr;
    // posix_memalign requires align to be a multiple of sizeof(void *)
    if (align == 1)
        return malloc(size);
    if (align == 2 || (sizeof(void *) == 8 && align == 4))
        align = sizeof(void *);
    if (!posix_memalign(&ptr, align, size))
        return ptr;
    return NULL;
}
FORCE_INLINE void _mm_free(void *addr)
{
    free(addr);
}
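
// Illustrative example (added for clarity, not part of the original header):
// allocating storage aligned for __m128i accesses and releasing it with the
// matching free, per the x86 API contract:
//   __m128i *buf = (__m128i *) _mm_malloc(64 * sizeof(__m128i), 16);
//   /* ... use buf ... */
//   _mm_free(buf);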
#if defined(__GNUC__) || defined(__clang__)
#pragma pop_macro("ALIGN_STRUCT")
#pragma pop_macro("FORCE_INLINE")
#endif