// This header file provides a simple API translation layer
// from SSE intrinsics to their corresponding Arm/Aarch64 NEON versions.
//
// This header file does not yet translate all of the SSE intrinsics.
// Contributors to this work are:
//   John W. Ratcliff <jratcliffscarab@gmail.com>
//   Brandon Rowlett <browlett@nvidia.com>
//   Ken Fast <kfast@gdeb.com>
//   Eric van Beurden <evanbeurden@nvidia.com>
//   Alexander Potylitsin <apotylitsin@nvidia.com>
//   Hasindu Gamaarachchi <hasindu2008@gmail.com>
//   Jim Huang <jserv@biilabs.io>
//   Mark Cheng <marktwtn@biilabs.io>
//   Malcolm James MacLeod <malcolm@gulden.com>
//   Devin Hussey (easyaspi314) <husseydevin@gmail.com>
//   Sebastian Pop <spop@amazon.com>
/*
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
#if defined(__GNUC__) || defined(__clang__)

#pragma push_macro("FORCE_INLINE")
#pragma push_macro("ALIGN_STRUCT")
#define FORCE_INLINE static inline __attribute__((always_inline))
#define ALIGN_STRUCT(x) __attribute__((aligned(x)))

#else

#error "Macro name collisions may happen with unknown compiler"
#ifdef FORCE_INLINE
#undef FORCE_INLINE
#endif
#define FORCE_INLINE static inline
#ifndef ALIGN_STRUCT
#define ALIGN_STRUCT(x) __declspec(align(x))
#endif

#endif

#include <stdint.h>
#include <arm_neon.h>
/*
 * MACRO for the shuffle parameter for _mm_shuffle_ps().
 * Argument fp3 is a digit[0123] that represents the fp from argument "b"
 * of mm_shuffle_ps that will be placed in fp3 of result. fp2 is the same
 * for fp2 in result. fp1 is a digit[0123] that represents the fp from
 * argument "a" of mm_shuffle_ps that will be placed in fp1 of result.
 * fp0 is the same for fp0 of result.
 */
#define _MM_SHUFFLE(fp3, fp2, fp1, fp0) \
    (((fp3) << 6) | ((fp2) << 4) | ((fp1) << 2) | ((fp0)))
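
// Example (illustrative): _MM_SHUFFLE(3, 2, 1, 0) evaluates to 0xE4, the
// identity selector for _mm_shuffle_ps:
//
//   __m128 r = _mm_shuffle_ps(a, b, _MM_SHUFFLE(3, 2, 1, 0));
//   // r0 = a0, r1 = a1, r2 = b2, r3 = b3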
/* indicate immediate constant argument in a given range */
#define __constrange(a, b) const

typedef float32x2_t __m64;
typedef float32x4_t __m128;
typedef int64x2_t __m128i;
// ******************************************
// type-safe casting between types
// ******************************************

#define vreinterpretq_m128_f16(x) vreinterpretq_f32_f16(x)
#define vreinterpretq_m128_f32(x) (x)
#define vreinterpretq_m128_f64(x) vreinterpretq_f32_f64(x)

#define vreinterpretq_m128_u8(x) vreinterpretq_f32_u8(x)
#define vreinterpretq_m128_u16(x) vreinterpretq_f32_u16(x)
#define vreinterpretq_m128_u32(x) vreinterpretq_f32_u32(x)
#define vreinterpretq_m128_u64(x) vreinterpretq_f32_u64(x)

#define vreinterpretq_m128_s8(x) vreinterpretq_f32_s8(x)
#define vreinterpretq_m128_s16(x) vreinterpretq_f32_s16(x)
#define vreinterpretq_m128_s32(x) vreinterpretq_f32_s32(x)
#define vreinterpretq_m128_s64(x) vreinterpretq_f32_s64(x)

#define vreinterpretq_f16_m128(x) vreinterpretq_f16_f32(x)
#define vreinterpretq_f32_m128(x) (x)
#define vreinterpretq_f64_m128(x) vreinterpretq_f64_f32(x)

#define vreinterpretq_u8_m128(x) vreinterpretq_u8_f32(x)
#define vreinterpretq_u16_m128(x) vreinterpretq_u16_f32(x)
#define vreinterpretq_u32_m128(x) vreinterpretq_u32_f32(x)
#define vreinterpretq_u64_m128(x) vreinterpretq_u64_f32(x)

#define vreinterpretq_s8_m128(x) vreinterpretq_s8_f32(x)
#define vreinterpretq_s16_m128(x) vreinterpretq_s16_f32(x)
#define vreinterpretq_s32_m128(x) vreinterpretq_s32_f32(x)
#define vreinterpretq_s64_m128(x) vreinterpretq_s64_f32(x)

#define vreinterpretq_m128i_s8(x) vreinterpretq_s64_s8(x)
#define vreinterpretq_m128i_s16(x) vreinterpretq_s64_s16(x)
#define vreinterpretq_m128i_s32(x) vreinterpretq_s64_s32(x)
#define vreinterpretq_m128i_s64(x) (x)

#define vreinterpretq_m128i_u8(x) vreinterpretq_s64_u8(x)
#define vreinterpretq_m128i_u16(x) vreinterpretq_s64_u16(x)
#define vreinterpretq_m128i_u32(x) vreinterpretq_s64_u32(x)
#define vreinterpretq_m128i_u64(x) vreinterpretq_s64_u64(x)

#define vreinterpretq_s8_m128i(x) vreinterpretq_s8_s64(x)
#define vreinterpretq_s16_m128i(x) vreinterpretq_s16_s64(x)
#define vreinterpretq_s32_m128i(x) vreinterpretq_s32_s64(x)
#define vreinterpretq_s64_m128i(x) (x)

#define vreinterpretq_u8_m128i(x) vreinterpretq_u8_s64(x)
#define vreinterpretq_u16_m128i(x) vreinterpretq_u16_s64(x)
#define vreinterpretq_u32_m128i(x) vreinterpretq_u32_s64(x)
#define vreinterpretq_u64_m128i(x) vreinterpretq_u64_s64(x)
// A struct is defined in this header file called 'SIMDVec' which can be used
// by applications which attempt to access the contents of an __m128 struct
// directly. It is important to note that accessing the __m128 struct directly
// is considered bad coding practice by Microsoft: @see:
// https://msdn.microsoft.com/en-us/library/ayeb3ayc.aspx
//
// However, some legacy source code may try to access the contents of an __m128
// struct directly so the developer can use the SIMDVec as an alias for it. Any
// casting must be done manually by the developer, as you cannot cast or
// otherwise alias the base NEON data type for intrinsic operations.
//
// This union is intended to allow direct access to an __m128 variable using the
// names that the MSVC compiler provides. It should really only be used when
// trying to access the members of the vector as integer values. GCC/clang
// allow native access to the float members through a simple array access
// operator (in C since 4.6, in C++ since 4.8).
//
// Ideally, direct accesses to SIMD vectors should not be used, since they can
// cause a performance hit. If it really is needed, however, the original __m128
// variable can be aliased with a pointer to this union and used to access
// individual components. The use of this union should be hidden behind a macro
// that is used throughout the codebase to access the members instead of always
// declaring this type of variable.
typedef union ALIGN_STRUCT(16) SIMDVec {
    float m128_f32[4];     // as floats - do not use this. Added for convenience.
    int8_t m128_i8[16];    // as signed 8-bit integers.
    int16_t m128_i16[8];   // as signed 16-bit integers.
    int32_t m128_i32[4];   // as signed 32-bit integers.
    int64_t m128_i64[2];   // as signed 64-bit integers.
    uint8_t m128_u8[16];   // as unsigned 8-bit integers.
    uint16_t m128_u16[8];  // as unsigned 16-bit integers.
    uint32_t m128_u32[4];  // as unsigned 32-bit integers.
    uint64_t m128_u64[2];  // as unsigned 64-bit integers.
} SIMDVec;
// casting using SIMDVec
#define vreinterpretq_nth_u64_m128i(x, n) (((SIMDVec *) &x)->m128_u64[n])
#define vreinterpretq_nth_u32_m128i(x, n) (((SIMDVec *) &x)->m128_u32[n])
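
// Usage sketch (illustrative): reading integer lanes of an __m128i through the
// SIMDVec union, per the note above.
//
//   __m128i v = _mm_set_epi32(4, 3, 2, 1);               // lanes [1, 2, 3, 4]
//   uint32_t lane0 = vreinterpretq_nth_u32_m128i(v, 0);  // 1
//   uint32_t lane3 = vreinterpretq_nth_u32_m128i(v, 3);  // 4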
// ******************************************
// Backwards compatibility for compilers that lack specific type support
// ******************************************

// Older gcc does not define vld1q_u8_x4 type
#if defined(__GNUC__) && !defined(__clang__)

FORCE_INLINE uint8x16x4_t vld1q_u8_x4(const uint8_t *p)
{
    uint8x16x4_t ret;

    ret.val[0] = vld1q_u8(p + 0);
    ret.val[1] = vld1q_u8(p + 16);
    ret.val[2] = vld1q_u8(p + 32);
    ret.val[3] = vld1q_u8(p + 48);

    return ret;
}

#endif
// ******************************************
// Set/get methods
// ******************************************
// Loads one cache line of data from address p to a location closer to the
// processor. https://msdn.microsoft.com/en-us/library/84szxsww(v=vs.100).aspx
FORCE_INLINE void _mm_prefetch(const void *p, int i)
{
    (void) i;  // the locality hint is not used on NEON
    __builtin_prefetch(p);
}

// Extracts the lower-order floating-point value from the parameter:
// https://msdn.microsoft.com/en-us/library/bb514059%28v=vs.120%29.aspx?f=255&MSPPError=-2147217396
FORCE_INLINE float _mm_cvtss_f32(__m128 a)
{
    return vgetq_lane_f32(vreinterpretq_f32_m128(a), 0);
}
// Sets the 128-bit value to zero
// https://msdn.microsoft.com/en-us/library/vstudio/ys7dw0kh(v=vs.100).aspx
FORCE_INLINE __m128i _mm_setzero_si128(void)
{
    return vreinterpretq_m128i_s32(vdupq_n_s32(0));
}

// Clears the four single-precision, floating-point values.
// https://msdn.microsoft.com/en-us/library/vstudio/tk1t2tbz(v=vs.100).aspx
FORCE_INLINE __m128 _mm_setzero_ps(void)
{
    return vreinterpretq_m128_f32(vdupq_n_f32(0));
}

// Sets the four single-precision, floating-point values to w.
//
//   r0 := r1 := r2 := r3 := w
//
// https://msdn.microsoft.com/en-us/library/vstudio/2x1se8ha(v=vs.100).aspx
FORCE_INLINE __m128 _mm_set1_ps(float _w)
{
    return vreinterpretq_m128_f32(vdupq_n_f32(_w));
}

// Sets the four single-precision, floating-point values to w.
// https://msdn.microsoft.com/en-us/library/vstudio/2x1se8ha(v=vs.100).aspx
FORCE_INLINE __m128 _mm_set_ps1(float _w)
{
    return vreinterpretq_m128_f32(vdupq_n_f32(_w));
}
// Sets the four single-precision, floating-point values to the four inputs.
// https://msdn.microsoft.com/en-us/library/vstudio/afh0zf75(v=vs.100).aspx
FORCE_INLINE __m128 _mm_set_ps(float w, float z, float y, float x)
{
    float __attribute__((aligned(16))) data[4] = {x, y, z, w};
    return vreinterpretq_m128_f32(vld1q_f32(data));
}

// Sets the four single-precision, floating-point values to the four inputs in
// reverse order.
// https://msdn.microsoft.com/en-us/library/vstudio/d2172ct3(v=vs.100).aspx
FORCE_INLINE __m128 _mm_setr_ps(float w, float z, float y, float x)
{
    float __attribute__((aligned(16))) data[4] = {w, z, y, x};
    return vreinterpretq_m128_f32(vld1q_f32(data));
}
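
// Element-order sketch (illustrative): _mm_set_ps fills lanes from the top
// argument down, _mm_setr_ps from the bottom up, so these two calls build the
// same vector.
//
//   __m128 a = _mm_set_ps(4.0f, 3.0f, 2.0f, 1.0f);   // lanes [1, 2, 3, 4]
//   __m128 b = _mm_setr_ps(1.0f, 2.0f, 3.0f, 4.0f);  // lanes [1, 2, 3, 4]
//   float lo = _mm_cvtss_f32(a);                     // 1.0f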
// Sets the 8 signed 16-bit integer values in reverse order.
FORCE_INLINE __m128i _mm_setr_epi16(short w0,
                                    short w1,
                                    short w2,
                                    short w3,
                                    short w4,
                                    short w5,
                                    short w6,
                                    short w7)
{
    int16_t __attribute__((aligned(16)))
        data[8] = {w0, w1, w2, w3, w4, w5, w6, w7};
    return vreinterpretq_m128i_s16(vld1q_s16((int16_t *) data));
}
// Sets the 4 signed 32-bit integer values in reverse order
// https://technet.microsoft.com/en-us/library/security/27yb3ee5(v=vs.90).aspx
FORCE_INLINE __m128i _mm_setr_epi32(int i3, int i2, int i1, int i0)
{
    int32_t __attribute__((aligned(16))) data[4] = {i3, i2, i1, i0};
    return vreinterpretq_m128i_s32(vld1q_s32(data));
}
// Sets the 16 signed 8-bit integer values to b.
// https://msdn.microsoft.com/en-us/library/6e14xhyf(v=vs.100).aspx
FORCE_INLINE __m128i _mm_set1_epi8(signed char w)
{
    return vreinterpretq_m128i_s8(vdupq_n_s8(w));
}

// Sets the 8 signed 16-bit integer values to w.
// https://msdn.microsoft.com/en-us/library/k0ya3x0e(v=vs.90).aspx
FORCE_INLINE __m128i _mm_set1_epi16(short w)
{
    return vreinterpretq_m128i_s16(vdupq_n_s16(w));
}
// Sets the 16 signed 8-bit integer values.
// https://msdn.microsoft.com/en-us/library/x0cx8zd3(v=vs.90).aspx
FORCE_INLINE __m128i _mm_set_epi8(signed char b15,
                                  signed char b14,
                                  signed char b13,
                                  signed char b12,
                                  signed char b11,
                                  signed char b10,
                                  signed char b9,
                                  signed char b8,
                                  signed char b7,
                                  signed char b6,
                                  signed char b5,
                                  signed char b4,
                                  signed char b3,
                                  signed char b2,
                                  signed char b1,
                                  signed char b0)
{
    int8_t __attribute__((aligned(16)))
        data[16] = {(int8_t) b0, (int8_t) b1, (int8_t) b2, (int8_t) b3,
                    (int8_t) b4, (int8_t) b5, (int8_t) b6, (int8_t) b7,
                    (int8_t) b8, (int8_t) b9, (int8_t) b10, (int8_t) b11,
                    (int8_t) b12, (int8_t) b13, (int8_t) b14, (int8_t) b15};
    return (__m128i) vld1q_s8(data);
}
// Sets the 8 signed 16-bit integer values.
// https://msdn.microsoft.com/en-au/library/3e0fek84(v=vs.90).aspx
FORCE_INLINE __m128i _mm_set_epi16(short i7,
                                   short i6,
                                   short i5,
                                   short i4,
                                   short i3,
                                   short i2,
                                   short i1,
                                   short i0)
{
    int16_t __attribute__((aligned(16)))
        data[8] = {i0, i1, i2, i3, i4, i5, i6, i7};
    return vreinterpretq_m128i_s16(vld1q_s16(data));
}
// Sets the 16 signed 8-bit integer values in reverse order.
// https://msdn.microsoft.com/en-us/library/2khb9c7k(v=vs.90).aspx
FORCE_INLINE __m128i _mm_setr_epi8(signed char b0,
                                   signed char b1,
                                   signed char b2,
                                   signed char b3,
                                   signed char b4,
                                   signed char b5,
                                   signed char b6,
                                   signed char b7,
                                   signed char b8,
                                   signed char b9,
                                   signed char b10,
                                   signed char b11,
                                   signed char b12,
                                   signed char b13,
                                   signed char b14,
                                   signed char b15)
{
    int8_t __attribute__((aligned(16)))
        data[16] = {(int8_t) b0, (int8_t) b1, (int8_t) b2, (int8_t) b3,
                    (int8_t) b4, (int8_t) b5, (int8_t) b6, (int8_t) b7,
                    (int8_t) b8, (int8_t) b9, (int8_t) b10, (int8_t) b11,
                    (int8_t) b12, (int8_t) b13, (int8_t) b14, (int8_t) b15};
    return (__m128i) vld1q_s8(data);
}
// Sets the 4 signed 32-bit integer values to i.
//
//   r0 := i
//   r1 := i
//   r2 := i
//   r3 := i
//
// https://msdn.microsoft.com/en-us/library/vstudio/h4xscxat(v=vs.100).aspx
FORCE_INLINE __m128i _mm_set1_epi32(int _i)
{
    return vreinterpretq_m128i_s32(vdupq_n_s32(_i));
}

// Sets the 2 signed 64-bit integer values to i.
// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/whtfzhzk(v=vs.100)
FORCE_INLINE __m128i _mm_set1_epi64(int64_t _i)
{
    return vreinterpretq_m128i_s64(vdupq_n_s64(_i));
}

// Sets the 2 signed 64-bit integer values to i.
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_set1_epi64x&expand=4961
FORCE_INLINE __m128i _mm_set1_epi64x(int64_t _i)
{
    return vreinterpretq_m128i_s64(vdupq_n_s64(_i));
}
// Sets the 4 signed 32-bit integer values.
// https://msdn.microsoft.com/en-us/library/vstudio/019beekt(v=vs.100).aspx
FORCE_INLINE __m128i _mm_set_epi32(int i3, int i2, int i1, int i0)
{
    int32_t __attribute__((aligned(16))) data[4] = {i0, i1, i2, i3};
    return vreinterpretq_m128i_s32(vld1q_s32(data));
}

// Returns the __m128i structure with its two 64-bit integer values
// initialized to the values of the two 64-bit integers passed in.
// https://msdn.microsoft.com/en-us/library/dk2sdw0h(v=vs.120).aspx
FORCE_INLINE __m128i _mm_set_epi64x(int64_t i1, int64_t i2)
{
    int64_t __attribute__((aligned(16))) data[2] = {i2, i1};
    return vreinterpretq_m128i_s64(vld1q_s64(data));
}
// Stores four single-precision, floating-point values.
// https://msdn.microsoft.com/en-us/library/vstudio/s3h4ay6y(v=vs.100).aspx
FORCE_INLINE void _mm_store_ps(float *p, __m128 a)
{
    vst1q_f32(p, vreinterpretq_f32_m128(a));
}

// Stores four single-precision, floating-point values.
// https://msdn.microsoft.com/en-us/library/44e30x22(v=vs.100).aspx
FORCE_INLINE void _mm_storeu_ps(float *p, __m128 a)
{
    vst1q_f32(p, vreinterpretq_f32_m128(a));
}

// Stores four 32-bit integer values (as a __m128i value) at the address p.
// https://msdn.microsoft.com/en-us/library/vstudio/edk11s13(v=vs.100).aspx
FORCE_INLINE void _mm_store_si128(__m128i *p, __m128i a)
{
    vst1q_s32((int32_t *) p, vreinterpretq_s32_m128i(a));
}

// Stores four 32-bit integer values (as a __m128i value) at the address p.
// https://msdn.microsoft.com/en-us/library/vstudio/edk11s13(v=vs.100).aspx
FORCE_INLINE void _mm_storeu_si128(__m128i *p, __m128i a)
{
    vst1q_s32((int32_t *) p, vreinterpretq_s32_m128i(a));
}

// Stores the lower single-precision, floating-point value.
// https://msdn.microsoft.com/en-us/library/tzz10fbx(v=vs.100).aspx
FORCE_INLINE void _mm_store_ss(float *p, __m128 a)
{
    vst1q_lane_f32(p, vreinterpretq_f32_m128(a), 0);
}
// Reads the lower 64 bits of b and stores them into the lower 64 bits of a.
// https://msdn.microsoft.com/en-us/library/hhwf428f%28v=vs.90%29.aspx
FORCE_INLINE void _mm_storel_epi64(__m128i *a, __m128i b)
{
    uint64x1_t hi = vget_high_u64(vreinterpretq_u64_m128i(*a));
    uint64x1_t lo = vget_low_u64(vreinterpretq_u64_m128i(b));
    *a = vreinterpretq_m128i_u64(vcombine_u64(lo, hi));
}
// Stores the lower two single-precision floating point values of a to the
// address p.
// https://msdn.microsoft.com/en-us/library/h54t98ks(v=vs.90).aspx
FORCE_INLINE void _mm_storel_pi(__m64 *p, __m128 a)
{
    *p = vget_low_f32(a);
}

// Stores the upper two single-precision, floating-point values of a to the
// address p.
// https://msdn.microsoft.com/en-us/library/a7525fs8(v%3dvs.90).aspx
FORCE_INLINE void _mm_storeh_pi(__m64 *p, __m128 a)
{
    *p = vget_high_f32(a);
}
// Loads a single single-precision, floating-point value, copying it into all
// four words.
// https://msdn.microsoft.com/en-us/library/vstudio/5cdkf716(v=vs.100).aspx
FORCE_INLINE __m128 _mm_load1_ps(const float *p)
{
    return vreinterpretq_m128_f32(vld1q_dup_f32(p));
}
#define _mm_load_ps1 _mm_load1_ps
// Sets the lower two single-precision, floating-point values with 64
// bits of data loaded from the address p; the upper two values are passed
// through from a.
// https://msdn.microsoft.com/en-us/library/s57cyak2(v=vs.100).aspx
FORCE_INLINE __m128 _mm_loadl_pi(__m128 a, __m64 const *p)
{
    return vreinterpretq_m128_f32(
        vcombine_f32(vld1_f32((const float32_t *) p), vget_high_f32(a)));
}

// Sets the upper two single-precision, floating-point values with 64
// bits of data loaded from the address p; the lower two values are passed
// through from a.
// https://msdn.microsoft.com/en-us/library/w92wta0x(v%3dvs.100).aspx
FORCE_INLINE __m128 _mm_loadh_pi(__m128 a, __m64 const *p)
{
    return vreinterpretq_m128_f32(
        vcombine_f32(vget_low_f32(a), vld1_f32((const float32_t *) p)));
}
// Loads four single-precision, floating-point values.
// https://msdn.microsoft.com/en-us/library/vstudio/zzd50xxt(v=vs.100).aspx
FORCE_INLINE __m128 _mm_load_ps(const float *p)
{
    return vreinterpretq_m128_f32(vld1q_f32(p));
}

// Loads four single-precision, floating-point values.
// https://msdn.microsoft.com/en-us/library/x1b16s7z%28v=vs.90%29.aspx
FORCE_INLINE __m128 _mm_loadu_ps(const float *p)
{
    // for neon, alignment doesn't matter, so _mm_load_ps and _mm_loadu_ps are
    // equivalent for neon
    return vreinterpretq_m128_f32(vld1q_f32(p));
}

// Loads a single single-precision, floating-point value into the low word and
// clears the upper three words.
// https://msdn.microsoft.com/en-us/library/548bb9h4%28v=vs.90%29.aspx
FORCE_INLINE __m128 _mm_load_ss(const float *p)
{
    return vreinterpretq_m128_f32(vsetq_lane_f32(*p, vdupq_n_f32(0), 0));
}
FORCE_INLINE __m128i _mm_loadl_epi64(__m128i const *p)
{
    /* Load the lower 64 bits of the value pointed to by p into the
     * lower 64 bits of the result, zeroing the upper 64 bits of the result.
     */
    return vreinterpretq_m128i_s32(
        vcombine_s32(vld1_s32((int32_t const *) p), vcreate_s32(0)));
}
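
// Usage sketch (illustrative): on NEON the aligned and unaligned variants above
// lower to the same vld1q/vst1q instructions, so a round trip is simply:
//
//   float in[4] = {1.0f, 2.0f, 3.0f, 4.0f}, out[4];
//   __m128 v = _mm_loadu_ps(in);
//   _mm_storeu_ps(out, v);  // out now holds the same four values as in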
// ******************************************
// Logic/Binary operations
// ******************************************
// Compares for inequality.
// https://msdn.microsoft.com/en-us/library/sf44thbx(v=vs.100).aspx
FORCE_INLINE __m128 _mm_cmpneq_ps(__m128 a, __m128 b)
{
    return vreinterpretq_m128_u32(vmvnq_u32(
        vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))));
}
// Computes the bitwise AND-NOT of the four single-precision, floating-point
// values of a and b.
//
//   r0 := ~a0 & b0
//   r1 := ~a1 & b1
//   r2 := ~a2 & b2
//   r3 := ~a3 & b3
//
// https://msdn.microsoft.com/en-us/library/vstudio/68h7wd02(v=vs.100).aspx
FORCE_INLINE __m128 _mm_andnot_ps(__m128 a, __m128 b)
{
    return vreinterpretq_m128_s32(
        vbicq_s32(vreinterpretq_s32_m128(b),
                  vreinterpretq_s32_m128(a)));  // *NOTE* argument swap
}

// Computes the bitwise AND of the 128-bit value in b and the bitwise NOT of the
// 128-bit value in a.
// https://msdn.microsoft.com/en-us/library/vstudio/1beaceh8(v=vs.100).aspx
FORCE_INLINE __m128i _mm_andnot_si128(__m128i a, __m128i b)
{
    return vreinterpretq_m128i_s32(
        vbicq_s32(vreinterpretq_s32_m128i(b),
                  vreinterpretq_s32_m128i(a)));  // *NOTE* argument swap
}
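
// Semantics sketch (illustrative): _mm_andnot_si128(a, b) computes ~a & b,
// while vbicq_s32(x, y) computes x & ~y, which is why the arguments are
// swapped above.
//
//   __m128i a = _mm_set1_epi32(0x0000FFFF);
//   __m128i b = _mm_set1_epi32(0x00FFFF00);
//   __m128i r = _mm_andnot_si128(a, b);  // 0x00FF0000 in every lane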
// Computes the bitwise AND of the 128-bit value in a and the 128-bit value in
// b.
// https://msdn.microsoft.com/en-us/library/vstudio/6d1txsa8(v=vs.100).aspx
FORCE_INLINE __m128i _mm_and_si128(__m128i a, __m128i b)
{
    return vreinterpretq_m128i_s32(
        vandq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
}

// Computes the bitwise AND of the four single-precision, floating-point values
// of a and b.
// https://msdn.microsoft.com/en-us/library/vstudio/73ck1xc5(v=vs.100).aspx
FORCE_INLINE __m128 _mm_and_ps(__m128 a, __m128 b)
{
    return vreinterpretq_m128_s32(
        vandq_s32(vreinterpretq_s32_m128(a), vreinterpretq_s32_m128(b)));
}

// Computes the bitwise OR of the four single-precision, floating-point values
// of a and b.
// https://msdn.microsoft.com/en-us/library/vstudio/7ctdsyy0(v=vs.100).aspx
FORCE_INLINE __m128 _mm_or_ps(__m128 a, __m128 b)
{
    return vreinterpretq_m128_s32(
        vorrq_s32(vreinterpretq_s32_m128(a), vreinterpretq_s32_m128(b)));
}
// Computes bitwise EXOR (exclusive-or) of the four single-precision,
// floating-point values of a and b.
// https://msdn.microsoft.com/en-us/library/ss6k3wk8(v=vs.100).aspx
FORCE_INLINE __m128 _mm_xor_ps(__m128 a, __m128 b)
{
    return vreinterpretq_m128_s32(
        veorq_s32(vreinterpretq_s32_m128(a), vreinterpretq_s32_m128(b)));
}

// Computes the bitwise OR of the 128-bit value in a and the 128-bit value in b.
// https://msdn.microsoft.com/en-us/library/vstudio/ew8ty0db(v=vs.100).aspx
FORCE_INLINE __m128i _mm_or_si128(__m128i a, __m128i b)
{
    return vreinterpretq_m128i_s32(
        vorrq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
}

// Computes the bitwise XOR of the 128-bit value in a and the 128-bit value in
// b. https://msdn.microsoft.com/en-us/library/fzt08www(v=vs.100).aspx
FORCE_INLINE __m128i _mm_xor_si128(__m128i a, __m128i b)
{
    return vreinterpretq_m128i_s32(
        veorq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
}
// Moves the upper two values of B into the lower two values of A.
FORCE_INLINE __m128 _mm_movehl_ps(__m128 __A, __m128 __B)
{
    float32x2_t a32 = vget_high_f32(vreinterpretq_f32_m128(__A));
    float32x2_t b32 = vget_high_f32(vreinterpretq_f32_m128(__B));
    return vreinterpretq_m128_f32(vcombine_f32(b32, a32));
}

// Moves the lower two values of B into the upper two values of A.
FORCE_INLINE __m128 _mm_movelh_ps(__m128 __A, __m128 __B)
{
    float32x2_t a10 = vget_low_f32(vreinterpretq_f32_m128(__A));
    float32x2_t b10 = vget_low_f32(vreinterpretq_f32_m128(__B));
    return vreinterpretq_m128_f32(vcombine_f32(a10, b10));
}
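
// Lane sketch (illustrative) for the two moves above, with a = [a0, a1, a2, a3]
// and b = [b0, b1, b2, b3]:
//
//   _mm_movehl_ps(a, b);  // [b2, b3, a2, a3]
//   _mm_movelh_ps(a, b);  // [a0, a1, b0, b1]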
FORCE_INLINE __m128i _mm_abs_epi32(__m128i a)
{
    return vreinterpretq_m128i_s32(vabsq_s32(vreinterpretq_s32_m128i(a)));
}

FORCE_INLINE __m128i _mm_abs_epi16(__m128i a)
{
    return vreinterpretq_m128i_s16(vabsq_s16(vreinterpretq_s16_m128i(a)));
}

FORCE_INLINE __m128i _mm_abs_epi8(__m128i a)
{
    return vreinterpretq_m128i_s8(vabsq_s8(vreinterpretq_s8_m128i(a)));
}
// Takes the upper 64 bits of a and places it in the low end of the result
// Takes the lower 64 bits of b and places it into the high end of the result.
FORCE_INLINE __m128 _mm_shuffle_ps_1032(__m128 a, __m128 b)
{
    float32x2_t a32 = vget_high_f32(vreinterpretq_f32_m128(a));
    float32x2_t b10 = vget_low_f32(vreinterpretq_f32_m128(b));
    return vreinterpretq_m128_f32(vcombine_f32(a32, b10));
}

// Takes the lower two 32-bit values from a, swaps them, and places them in the
// low end of the result; takes the upper two 32-bit values from b, swaps them,
// and places them in the high end of the result.
FORCE_INLINE __m128 _mm_shuffle_ps_2301(__m128 a, __m128 b)
{
    float32x2_t a01 = vrev64_f32(vget_low_f32(vreinterpretq_f32_m128(a)));
    float32x2_t b23 = vrev64_f32(vget_high_f32(vreinterpretq_f32_m128(b)));
    return vreinterpretq_m128_f32(vcombine_f32(a01, b23));
}
FORCE_INLINE __m128 _mm_shuffle_ps_0321(__m128 a, __m128 b)
{
    float32x2_t a21 = vget_high_f32(
        vextq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a), 3));
    float32x2_t b03 = vget_low_f32(
        vextq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b), 3));
    return vreinterpretq_m128_f32(vcombine_f32(a21, b03));
}

FORCE_INLINE __m128 _mm_shuffle_ps_2103(__m128 a, __m128 b)
{
    float32x2_t a03 = vget_low_f32(
        vextq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a), 3));
    float32x2_t b21 = vget_high_f32(
        vextq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b), 3));
    return vreinterpretq_m128_f32(vcombine_f32(a03, b21));
}

FORCE_INLINE __m128 _mm_shuffle_ps_1010(__m128 a, __m128 b)
{
    float32x2_t a10 = vget_low_f32(vreinterpretq_f32_m128(a));
    float32x2_t b10 = vget_low_f32(vreinterpretq_f32_m128(b));
    return vreinterpretq_m128_f32(vcombine_f32(a10, b10));
}

FORCE_INLINE __m128 _mm_shuffle_ps_1001(__m128 a, __m128 b)
{
    float32x2_t a01 = vrev64_f32(vget_low_f32(vreinterpretq_f32_m128(a)));
    float32x2_t b10 = vget_low_f32(vreinterpretq_f32_m128(b));
    return vreinterpretq_m128_f32(vcombine_f32(a01, b10));
}

FORCE_INLINE __m128 _mm_shuffle_ps_0101(__m128 a, __m128 b)
{
    float32x2_t a01 = vrev64_f32(vget_low_f32(vreinterpretq_f32_m128(a)));
    float32x2_t b01 = vrev64_f32(vget_low_f32(vreinterpretq_f32_m128(b)));
    return vreinterpretq_m128_f32(vcombine_f32(a01, b01));
}
// keeps the low 64 bits of a in the low end and puts the high 64 bits of b in
// the high end of the result.
FORCE_INLINE __m128 _mm_shuffle_ps_3210(__m128 a, __m128 b)
{
    float32x2_t a10 = vget_low_f32(vreinterpretq_f32_m128(a));
    float32x2_t b32 = vget_high_f32(vreinterpretq_f32_m128(b));
    return vreinterpretq_m128_f32(vcombine_f32(a10, b32));
}
FORCE_INLINE __m128 _mm_shuffle_ps_0011(__m128 a, __m128 b)
{
    float32x2_t a11 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(a)), 1);
    float32x2_t b00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 0);
    return vreinterpretq_m128_f32(vcombine_f32(a11, b00));
}

FORCE_INLINE __m128 _mm_shuffle_ps_0022(__m128 a, __m128 b)
{
    float32x2_t a22 =
        vdup_lane_f32(vget_high_f32(vreinterpretq_f32_m128(a)), 0);
    float32x2_t b00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 0);
    return vreinterpretq_m128_f32(vcombine_f32(a22, b00));
}

FORCE_INLINE __m128 _mm_shuffle_ps_2200(__m128 a, __m128 b)
{
    float32x2_t a00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(a)), 0);
    float32x2_t b22 =
        vdup_lane_f32(vget_high_f32(vreinterpretq_f32_m128(b)), 0);
    return vreinterpretq_m128_f32(vcombine_f32(a00, b22));
}

FORCE_INLINE __m128 _mm_shuffle_ps_3202(__m128 a, __m128 b)
{
    float32_t a0 = vgetq_lane_f32(vreinterpretq_f32_m128(a), 0);
    float32x2_t a22 =
        vdup_lane_f32(vget_high_f32(vreinterpretq_f32_m128(a)), 0);
    float32x2_t a02 = vset_lane_f32(a0, a22, 1); /* TODO: use vzip ?*/
    float32x2_t b32 = vget_high_f32(vreinterpretq_f32_m128(b));
    return vreinterpretq_m128_f32(vcombine_f32(a02, b32));
}

FORCE_INLINE __m128 _mm_shuffle_ps_1133(__m128 a, __m128 b)
{
    float32x2_t a33 =
        vdup_lane_f32(vget_high_f32(vreinterpretq_f32_m128(a)), 1);
    float32x2_t b11 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 1);
    return vreinterpretq_m128_f32(vcombine_f32(a33, b11));
}

FORCE_INLINE __m128 _mm_shuffle_ps_2010(__m128 a, __m128 b)
{
    float32x2_t a10 = vget_low_f32(vreinterpretq_f32_m128(a));
    float32_t b2 = vgetq_lane_f32(vreinterpretq_f32_m128(b), 2);
    float32x2_t b00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 0);
    float32x2_t b20 = vset_lane_f32(b2, b00, 1);
    return vreinterpretq_m128_f32(vcombine_f32(a10, b20));
}

FORCE_INLINE __m128 _mm_shuffle_ps_2001(__m128 a, __m128 b)
{
    float32x2_t a01 = vrev64_f32(vget_low_f32(vreinterpretq_f32_m128(a)));
    float32_t b2 = vgetq_lane_f32(vreinterpretq_f32_m128(b), 2);
    float32x2_t b00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 0);
    float32x2_t b20 = vset_lane_f32(b2, b00, 1);
    return vreinterpretq_m128_f32(vcombine_f32(a01, b20));
}

FORCE_INLINE __m128 _mm_shuffle_ps_2032(__m128 a, __m128 b)
{
    float32x2_t a32 = vget_high_f32(vreinterpretq_f32_m128(a));
    float32_t b2 = vgetq_lane_f32(vreinterpretq_f32_m128(b), 2);
    float32x2_t b00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 0);
    float32x2_t b20 = vset_lane_f32(b2, b00, 1);
    return vreinterpretq_m128_f32(vcombine_f32(a32, b20));
}
// NEON does not support a general purpose permute intrinsic
// Selects four specific single-precision, floating-point values from a and b,
// based on the mask i.
// https://msdn.microsoft.com/en-us/library/vstudio/5f0858x0(v=vs.100).aspx
#if 0 /* C version */
FORCE_INLINE __m128 _mm_shuffle_ps_default(__m128 a,
                                           __m128 b,
                                           __constrange(0, 255) int imm)
{
    __m128 ret;
    ret[0] = a[imm & 0x3];
    ret[1] = a[(imm >> 2) & 0x3];
    ret[2] = b[(imm >> 4) & 0x03];
    ret[3] = b[(imm >> 6) & 0x03];
    return ret;
}
#endif
#define _mm_shuffle_ps_default(a, b, imm)                                  \
    __extension__({                                                        \
        float32x4_t ret;                                                   \
        ret = vmovq_n_f32(                                                 \
            vgetq_lane_f32(vreinterpretq_f32_m128(a), (imm) & 0x3));       \
        ret = vsetq_lane_f32(                                              \
            vgetq_lane_f32(vreinterpretq_f32_m128(a), ((imm) >> 2) & 0x3), \
            ret, 1);                                                       \
        ret = vsetq_lane_f32(                                              \
            vgetq_lane_f32(vreinterpretq_f32_m128(b), ((imm) >> 4) & 0x3), \
            ret, 2);                                                       \
        ret = vsetq_lane_f32(                                              \
            vgetq_lane_f32(vreinterpretq_f32_m128(b), ((imm) >> 6) & 0x3), \
            ret, 3);                                                       \
        vreinterpretq_m128_f32(ret);                                       \
    })
// FORCE_INLINE __m128 _mm_shuffle_ps(__m128 a, __m128 b, __constrange(0,255)
// int imm)
#if defined(__clang__)
#define _mm_shuffle_ps(a, b, imm)                              \
    __extension__({                                            \
        float32x4_t _input1 = vreinterpretq_f32_m128(a);       \
        float32x4_t _input2 = vreinterpretq_f32_m128(b);       \
        float32x4_t _shuf =                                    \
            __builtin_shufflevector(_input1, _input2,          \
                                    (imm) & 0x3,               \
                                    ((imm) >> 2) & 0x3,        \
                                    (((imm) >> 4) & 0x3) + 4,  \
                                    (((imm) >> 6) & 0x3) + 4); \
        vreinterpretq_m128_f32(_shuf);                         \
    })
#else  // generic
#define _mm_shuffle_ps(a, b, imm)                          \
    __extension__({                                        \
        __m128 ret;                                        \
        switch (imm) {                                     \
        case _MM_SHUFFLE(1, 0, 3, 2):                      \
            ret = _mm_shuffle_ps_1032((a), (b));           \
            break;                                         \
        case _MM_SHUFFLE(2, 3, 0, 1):                      \
            ret = _mm_shuffle_ps_2301((a), (b));           \
            break;                                         \
        case _MM_SHUFFLE(0, 3, 2, 1):                      \
            ret = _mm_shuffle_ps_0321((a), (b));           \
            break;                                         \
        case _MM_SHUFFLE(2, 1, 0, 3):                      \
            ret = _mm_shuffle_ps_2103((a), (b));           \
            break;                                         \
        case _MM_SHUFFLE(1, 0, 1, 0):                      \
            ret = _mm_movelh_ps((a), (b));                 \
            break;                                         \
        case _MM_SHUFFLE(1, 0, 0, 1):                      \
            ret = _mm_shuffle_ps_1001((a), (b));           \
            break;                                         \
        case _MM_SHUFFLE(0, 1, 0, 1):                      \
            ret = _mm_shuffle_ps_0101((a), (b));           \
            break;                                         \
        case _MM_SHUFFLE(3, 2, 1, 0):                      \
            ret = _mm_shuffle_ps_3210((a), (b));           \
            break;                                         \
        case _MM_SHUFFLE(0, 0, 1, 1):                      \
            ret = _mm_shuffle_ps_0011((a), (b));           \
            break;                                         \
        case _MM_SHUFFLE(0, 0, 2, 2):                      \
            ret = _mm_shuffle_ps_0022((a), (b));           \
            break;                                         \
        case _MM_SHUFFLE(2, 2, 0, 0):                      \
            ret = _mm_shuffle_ps_2200((a), (b));           \
            break;                                         \
        case _MM_SHUFFLE(3, 2, 0, 2):                      \
            ret = _mm_shuffle_ps_3202((a), (b));           \
            break;                                         \
        case _MM_SHUFFLE(3, 2, 3, 2):                      \
            ret = _mm_movehl_ps((b), (a));                 \
            break;                                         \
        case _MM_SHUFFLE(1, 1, 3, 3):                      \
            ret = _mm_shuffle_ps_1133((a), (b));           \
            break;                                         \
        case _MM_SHUFFLE(2, 0, 1, 0):                      \
            ret = _mm_shuffle_ps_2010((a), (b));           \
            break;                                         \
        case _MM_SHUFFLE(2, 0, 0, 1):                      \
            ret = _mm_shuffle_ps_2001((a), (b));           \
            break;                                         \
        case _MM_SHUFFLE(2, 0, 3, 2):                      \
            ret = _mm_shuffle_ps_2032((a), (b));           \
            break;                                         \
        default:                                           \
            ret = _mm_shuffle_ps_default((a), (b), (imm)); \
            break;                                         \
        }                                                  \
        ret;                                               \
    })
#endif
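
// Usage sketch (illustrative): broadcasting one lane with the generic path
// above (0xAA is not one of the special-cased selectors, so it falls through
// to _mm_shuffle_ps_default).
//
//   __m128 a = _mm_setr_ps(1.0f, 2.0f, 3.0f, 4.0f);
//   __m128 r = _mm_shuffle_ps(a, a, _MM_SHUFFLE(2, 2, 2, 2));  // [3, 3, 3, 3]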
// Takes the upper 64 bits of a and places it in the low end of the result
// Takes the lower 64 bits of a and places it into the high end of the result.
FORCE_INLINE __m128i _mm_shuffle_epi_1032(__m128i a)
{
    int32x2_t a32 = vget_high_s32(vreinterpretq_s32_m128i(a));
    int32x2_t a10 = vget_low_s32(vreinterpretq_s32_m128i(a));
    return vreinterpretq_m128i_s32(vcombine_s32(a32, a10));
}

// takes the lower two 32-bit values from a and swaps them and places them in
// the low end of the result; takes the higher two 32-bit values from a and
// swaps them and places them in the high end of the result.
FORCE_INLINE __m128i _mm_shuffle_epi_2301(__m128i a)
{
    int32x2_t a01 = vrev64_s32(vget_low_s32(vreinterpretq_s32_m128i(a)));
    int32x2_t a23 = vrev64_s32(vget_high_s32(vreinterpretq_s32_m128i(a)));
    return vreinterpretq_m128i_s32(vcombine_s32(a01, a23));
}

// rotates the least significant 32 bits into the most significant 32 bits, and
// shifts the rest down
FORCE_INLINE __m128i _mm_shuffle_epi_0321(__m128i a)
{
    return vreinterpretq_m128i_s32(
        vextq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(a), 1));
}

// rotates the most significant 32 bits into the least significant 32 bits, and
// shifts the rest up
FORCE_INLINE __m128i _mm_shuffle_epi_2103(__m128i a)
{
    return vreinterpretq_m128i_s32(
        vextq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(a), 3));
}
// gets the lower 64 bits of a, and places it in the upper 64 bits
// gets the lower 64 bits of a and places it in the lower 64 bits
FORCE_INLINE __m128i _mm_shuffle_epi_1010(__m128i a)
{
    int32x2_t a10 = vget_low_s32(vreinterpretq_s32_m128i(a));
    return vreinterpretq_m128i_s32(vcombine_s32(a10, a10));
}

// gets the lower 64 bits of a, swaps the 0 and 1 elements, and places it in the
// lower 64 bits; gets the lower 64 bits of a, and places it in the upper 64
// bits
FORCE_INLINE __m128i _mm_shuffle_epi_1001(__m128i a)
{
    int32x2_t a01 = vrev64_s32(vget_low_s32(vreinterpretq_s32_m128i(a)));
    int32x2_t a10 = vget_low_s32(vreinterpretq_s32_m128i(a));
    return vreinterpretq_m128i_s32(vcombine_s32(a01, a10));
}

// gets the lower 64 bits of a, swaps the 0 and 1 elements and places it in the
// upper 64 bits; gets the lower 64 bits of a, swaps the 0 and 1 elements, and
// places it in the lower 64 bits
FORCE_INLINE __m128i _mm_shuffle_epi_0101(__m128i a)
{
    int32x2_t a01 = vrev64_s32(vget_low_s32(vreinterpretq_s32_m128i(a)));
    return vreinterpretq_m128i_s32(vcombine_s32(a01, a01));
}

FORCE_INLINE __m128i _mm_shuffle_epi_2211(__m128i a)
{
    int32x2_t a11 = vdup_lane_s32(vget_low_s32(vreinterpretq_s32_m128i(a)), 1);
    int32x2_t a22 = vdup_lane_s32(vget_high_s32(vreinterpretq_s32_m128i(a)), 0);
    return vreinterpretq_m128i_s32(vcombine_s32(a11, a22));
}

FORCE_INLINE __m128i _mm_shuffle_epi_0122(__m128i a)
{
    int32x2_t a22 = vdup_lane_s32(vget_high_s32(vreinterpretq_s32_m128i(a)), 0);
    int32x2_t a01 = vrev64_s32(vget_low_s32(vreinterpretq_s32_m128i(a)));
    return vreinterpretq_m128i_s32(vcombine_s32(a22, a01));
}

FORCE_INLINE __m128i _mm_shuffle_epi_3332(__m128i a)
{
    int32x2_t a32 = vget_high_s32(vreinterpretq_s32_m128i(a));
    int32x2_t a33 = vdup_lane_s32(vget_high_s32(vreinterpretq_s32_m128i(a)), 1);
    return vreinterpretq_m128i_s32(vcombine_s32(a32, a33));
}
// Shuffle packed 8-bit integers in a according to shuffle control mask in the
// corresponding 8-bit element of b, and store the results in dst.
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_shuffle_epi8&expand=5146
FORCE_INLINE __m128i _mm_shuffle_epi8(__m128i a, __m128i b)
{
    int8x16_t tbl = vreinterpretq_s8_m128i(a);   // input a
    uint8x16_t idx = vreinterpretq_u8_m128i(b);  // input b
    uint8x16_t idx_masked =
        vandq_u8(idx, vdupq_n_u8(0x8F));  // avoid using meaningless bits
#if defined(__aarch64__)
    return vreinterpretq_m128i_s8(vqtbl1q_s8(tbl, idx_masked));
#elif defined(__GNUC__)
    int8x16_t ret;
    // %e and %f represent the even and odd D registers
    // of the corresponding Q register.
    __asm__ __volatile__(
        " vtbl.8 %e[ret], {%e[tbl], %f[tbl]}, %e[idx]\n"
        " vtbl.8 %f[ret], {%e[tbl], %f[tbl]}, %f[idx]\n"
        : [ret] "=&w"(ret)
        : [tbl] "w"(tbl), [idx] "w"(idx_masked));
    return vreinterpretq_m128i_s8(ret);
#else
    // use this line if testing on aarch64
    int8x8x2_t a_split = {vget_low_s8(tbl), vget_high_s8(tbl)};
    return vreinterpretq_m128i_s8(
        vcombine_s8(vtbl2_s8(a_split, vget_low_u8(idx_masked)),
                    vtbl2_s8(a_split, vget_high_u8(idx_masked))));
#endif
}
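
// Usage sketch (illustrative): reversing the bytes of a vector with a constant
// index table.
//
//   const int8_t rev[16] = {15, 14, 13, 12, 11, 10, 9, 8,
//                           7, 6, 5, 4, 3, 2, 1, 0};
//   __m128i idx = vreinterpretq_m128i_s8(vld1q_s8(rev));
//   __m128i r = _mm_shuffle_epi8(a, idx);  // byte i of r = byte (15 - i) of a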
#if 0 /* C version */
FORCE_INLINE __m128i _mm_shuffle_epi32_default(__m128i a,
                                               __constrange(0, 255) int imm)
{
    __m128i ret;
    ret[0] = a[imm & 0x3];
    ret[1] = a[(imm >> 2) & 0x3];
    ret[2] = a[(imm >> 4) & 0x03];
    ret[3] = a[(imm >> 6) & 0x03];
    return ret;
}
#endif
#define _mm_shuffle_epi32_default(a, imm)                                   \
    __extension__({                                                         \
        int32x4_t ret;                                                      \
        ret = vmovq_n_s32(                                                  \
            vgetq_lane_s32(vreinterpretq_s32_m128i(a), (imm) & 0x3));       \
        ret = vsetq_lane_s32(                                               \
            vgetq_lane_s32(vreinterpretq_s32_m128i(a), ((imm) >> 2) & 0x3), \
            ret, 1);                                                        \
        ret = vsetq_lane_s32(                                               \
            vgetq_lane_s32(vreinterpretq_s32_m128i(a), ((imm) >> 4) & 0x3), \
            ret, 2);                                                        \
        ret = vsetq_lane_s32(                                               \
            vgetq_lane_s32(vreinterpretq_s32_m128i(a), ((imm) >> 6) & 0x3), \
            ret, 3);                                                        \
        vreinterpretq_m128i_s32(ret);                                       \
    })
// FORCE_INLINE __m128i _mm_shuffle_epi32_splat(__m128i a, __constrange(0,255)
// int imm)
#if defined(__aarch64__)
#define _mm_shuffle_epi32_splat(a, imm)                          \
    __extension__({                                              \
        vreinterpretq_m128i_s32(                                 \
            vdupq_laneq_s32(vreinterpretq_s32_m128i(a), (imm))); \
    })
#else
#define _mm_shuffle_epi32_splat(a, imm)                                      \
    __extension__({                                                          \
        vreinterpretq_m128i_s32(                                             \
            vdupq_n_s32(vgetq_lane_s32(vreinterpretq_s32_m128i(a), (imm)))); \
    })
#endif
// Shuffles the 4 signed or unsigned 32-bit integers in a as specified by imm.
// https://msdn.microsoft.com/en-us/library/56f67xbk%28v=vs.90%29.aspx
// FORCE_INLINE __m128i _mm_shuffle_epi32(__m128i a, __constrange(0,255) int
// imm)
#if defined(__clang__)
#define _mm_shuffle_epi32(a, imm)                                            \
    __extension__({                                                          \
        int32x4_t _input = vreinterpretq_s32_m128i(a);                       \
        int32x4_t _shuf =                                                    \
            __builtin_shufflevector(_input, _input,                          \
                                    (imm) & 0x3, ((imm) >> 2) & 0x3,         \
                                    ((imm) >> 4) & 0x3, ((imm) >> 6) & 0x3); \
        vreinterpretq_m128i_s32(_shuf);                                      \
    })
#else  // generic
#define _mm_shuffle_epi32(a, imm)                        \
    __extension__({                                      \
        __m128i ret;                                     \
        switch (imm) {                                   \
        case _MM_SHUFFLE(1, 0, 3, 2):                    \
            ret = _mm_shuffle_epi_1032((a));             \
            break;                                       \
        case _MM_SHUFFLE(2, 3, 0, 1):                    \
            ret = _mm_shuffle_epi_2301((a));             \
            break;                                       \
        case _MM_SHUFFLE(0, 3, 2, 1):                    \
            ret = _mm_shuffle_epi_0321((a));             \
            break;                                       \
        case _MM_SHUFFLE(2, 1, 0, 3):                    \
            ret = _mm_shuffle_epi_2103((a));             \
            break;                                       \
        case _MM_SHUFFLE(1, 0, 1, 0):                    \
            ret = _mm_shuffle_epi_1010((a));             \
            break;                                       \
        case _MM_SHUFFLE(1, 0, 0, 1):                    \
            ret = _mm_shuffle_epi_1001((a));             \
            break;                                       \
        case _MM_SHUFFLE(0, 1, 0, 1):                    \
            ret = _mm_shuffle_epi_0101((a));             \
            break;                                       \
        case _MM_SHUFFLE(2, 2, 1, 1):                    \
            ret = _mm_shuffle_epi_2211((a));             \
            break;                                       \
        case _MM_SHUFFLE(0, 1, 2, 2):                    \
            ret = _mm_shuffle_epi_0122((a));             \
            break;                                       \
        case _MM_SHUFFLE(3, 3, 3, 2):                    \
            ret = _mm_shuffle_epi_3332((a));             \
            break;                                       \
        case _MM_SHUFFLE(0, 0, 0, 0):                    \
            ret = _mm_shuffle_epi32_splat((a), 0);       \
            break;                                       \
        case _MM_SHUFFLE(1, 1, 1, 1):                    \
            ret = _mm_shuffle_epi32_splat((a), 1);       \
            break;                                       \
        case _MM_SHUFFLE(2, 2, 2, 2):                    \
            ret = _mm_shuffle_epi32_splat((a), 2);       \
            break;                                       \
        case _MM_SHUFFLE(3, 3, 3, 3):                    \
            ret = _mm_shuffle_epi32_splat((a), 3);       \
            break;                                       \
        default:                                         \
            ret = _mm_shuffle_epi32_default((a), (imm)); \
            break;                                       \
        }                                                \
        ret;                                             \
    })
#endif
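
// Usage sketch (illustrative): reversing the 32-bit lanes of a vector.
//
//   __m128i v = _mm_setr_epi32(10, 20, 30, 40);
//   __m128i r = _mm_shuffle_epi32(v, _MM_SHUFFLE(0, 1, 2, 3));
//   // r = [40, 30, 20, 10]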
// Shuffles the lower 4 signed or unsigned 16-bit integers in a as specified
// by imm.
// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/y41dkk37(v=vs.100)
// FORCE_INLINE __m128i _mm_shufflelo_epi16_function(__m128i a,
// __constrange(0,255) int imm)
#define _mm_shufflelo_epi16_function(a, imm)                                  \
    __extension__({                                                           \
        int16x8_t ret = vreinterpretq_s16_m128i(a);                           \
        int16x4_t lowBits = vget_low_s16(ret);                                \
        ret = vsetq_lane_s16(vget_lane_s16(lowBits, (imm) & 0x3), ret, 0);    \
        ret = vsetq_lane_s16(vget_lane_s16(lowBits, ((imm) >> 2) & 0x3), ret, \
                             1);                                              \
        ret = vsetq_lane_s16(vget_lane_s16(lowBits, ((imm) >> 4) & 0x3), ret, \
                             2);                                              \
        ret = vsetq_lane_s16(vget_lane_s16(lowBits, ((imm) >> 6) & 0x3), ret, \
                             3);                                              \
        vreinterpretq_m128i_s16(ret);                                         \
    })

// FORCE_INLINE __m128i _mm_shufflelo_epi16(__m128i a, __constrange(0,255) int
// imm)
#if defined(__clang__)
#define _mm_shufflelo_epi16(a, imm)                       \
    __extension__({                                       \
        int16x8_t _input = vreinterpretq_s16_m128i(a);    \
        int16x8_t _shuf =                                 \
            __builtin_shufflevector(_input, _input,       \
                                    ((imm) & 0x3),        \
                                    (((imm) >> 2) & 0x3), \
                                    (((imm) >> 4) & 0x3), \
                                    (((imm) >> 6) & 0x3), \
                                    4, 5, 6, 7);          \
        vreinterpretq_m128i_s16(_shuf);                   \
    })
#else  // generic
#define _mm_shufflelo_epi16(a, imm) _mm_shufflelo_epi16_function((a), (imm))
#endif
// Shuffles the upper 4 signed or unsigned 16-bit integers in a as specified
// by imm.
// https://msdn.microsoft.com/en-us/library/13ywktbs(v=vs.100).aspx
// FORCE_INLINE __m128i _mm_shufflehi_epi16_function(__m128i a,
// __constrange(0,255) int imm)
#define _mm_shufflehi_epi16_function(a, imm)                                   \
    __extension__({                                                            \
        int16x8_t ret = vreinterpretq_s16_m128i(a);                            \
        int16x4_t highBits = vget_high_s16(ret);                               \
        ret = vsetq_lane_s16(vget_lane_s16(highBits, (imm) & 0x3), ret, 4);    \
        ret = vsetq_lane_s16(vget_lane_s16(highBits, ((imm) >> 2) & 0x3), ret, \
                             5);                                               \
        ret = vsetq_lane_s16(vget_lane_s16(highBits, ((imm) >> 4) & 0x3), ret, \
                             6);                                               \
        ret = vsetq_lane_s16(vget_lane_s16(highBits, ((imm) >> 6) & 0x3), ret, \
                             7);                                               \
        vreinterpretq_m128i_s16(ret);                                          \
    })

// FORCE_INLINE __m128i _mm_shufflehi_epi16(__m128i a, __constrange(0,255) int
// imm)
#if defined(__clang__)
#define _mm_shufflehi_epi16(a, imm)                            \
    __extension__({                                            \
        int16x8_t _input = vreinterpretq_s16_m128i(a);         \
        int16x8_t _shuf =                                      \
            __builtin_shufflevector(_input, _input,            \
                                    0, 1, 2, 3,                \
                                    ((imm) & 0x3) + 4,         \
                                    (((imm) >> 2) & 0x3) + 4,  \
                                    (((imm) >> 4) & 0x3) + 4,  \
                                    (((imm) >> 6) & 0x3) + 4); \
        vreinterpretq_m128i_s16(_shuf);                        \
    })
#else  // generic
#define _mm_shufflehi_epi16(a, imm) _mm_shufflehi_epi16_function((a), (imm))
#endif
// Blend packed 16-bit integers from a and b using control mask imm8, and store
// the results in dst.
//
//   FOR j := 0 to 7
//       i := j*16
//       IF imm8[j]
//           dst[i+15:i] := b[i+15:i]
//       ELSE
//           dst[i+15:i] := a[i+15:i]
//       FI
//   ENDFOR
// FORCE_INLINE __m128i _mm_blend_epi16(__m128i a, __m128i b,
// __constrange(0,255) int imm)
#define _mm_blend_epi16(a, b, imm)                                        \
    __extension__({                                                       \
        const uint16_t _mask[8] = {((imm) & (1 << 0)) ? 0xFFFF : 0x0000,  \
                                   ((imm) & (1 << 1)) ? 0xFFFF : 0x0000,  \
                                   ((imm) & (1 << 2)) ? 0xFFFF : 0x0000,  \
                                   ((imm) & (1 << 3)) ? 0xFFFF : 0x0000,  \
                                   ((imm) & (1 << 4)) ? 0xFFFF : 0x0000,  \
                                   ((imm) & (1 << 5)) ? 0xFFFF : 0x0000,  \
                                   ((imm) & (1 << 6)) ? 0xFFFF : 0x0000,  \
                                   ((imm) & (1 << 7)) ? 0xFFFF : 0x0000}; \
        uint16x8_t _mask_vec = vld1q_u16(_mask);                          \
        uint16x8_t _a = vreinterpretq_u16_m128i(a);                       \
        uint16x8_t _b = vreinterpretq_u16_m128i(b);                       \
        vreinterpretq_m128i_u16(vbslq_u16(_mask_vec, _b, _a));            \
    })
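
// Usage sketch (illustrative): bit j of imm selects lane j from b, so 0x0F
// takes the low four lanes from b and the high four from a.
//
//   __m128i r = _mm_blend_epi16(a, b, 0x0F);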
// Blend packed 8-bit integers from a and b using mask, and store the results in
// dst.
//
//   FOR j := 0 to 15
//       i := j*8
//       IF mask[i+7]
//           dst[i+7:i] := b[i+7:i]
//       ELSE
//           dst[i+7:i] := a[i+7:i]
//       FI
//   ENDFOR
FORCE_INLINE __m128i _mm_blendv_epi8(__m128i _a, __m128i _b, __m128i _mask)
{
    // Use a signed shift right to create a mask with the sign bit
    uint8x16_t mask =
        vreinterpretq_u8_s8(vshrq_n_s8(vreinterpretq_s8_m128i(_mask), 7));
    uint8x16_t a = vreinterpretq_u8_m128i(_a);
    uint8x16_t b = vreinterpretq_u8_m128i(_b);
    return vreinterpretq_m128i_u8(vbslq_u8(mask, b, a));
}
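
// Usage sketch (illustrative): lanes whose mask byte has the sign bit set take
// their value from b, all others from a.
//
//   __m128i m = _mm_set1_epi8(-1);         // sign bit set in every lane
//   __m128i r = _mm_blendv_epi8(a, b, m);  // r == b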
/////////////////////////////////////
// Shifts
/////////////////////////////////////
// Shifts the 4 signed 32-bit integers in a right by count bits while shifting
// in the sign bit.
//
//   r0 := a0 >> count
//   r1 := a1 >> count
//   r2 := a2 >> count
//   r3 := a3 >> count
FORCE_INLINE __m128i _mm_srai_epi32(__m128i a, int count)
{
    return (__m128i) vshlq_s32((int32x4_t) a, vdupq_n_s32(-count));
}

// Shifts the 8 signed 16-bit integers in a right by count bits while shifting
// in the sign bit.
//
//   r0 := a0 >> count
//   r1 := a1 >> count
//   ...
//   r7 := a7 >> count
FORCE_INLINE __m128i _mm_srai_epi16(__m128i a, int count)
{
    return (__m128i) vshlq_s16((int16x8_t) a, vdupq_n_s16(-count));
}
// Shifts the 8 signed or unsigned 16-bit integers in a left by count bits while
// shifting in zeros.
//
//   r0 := a0 << count
//   r1 := a1 << count
//   ...
//   r7 := a7 << count
//
// https://msdn.microsoft.com/en-us/library/es73bcsy(v=vs.90).aspx
// FORCE_INLINE __m128i _mm_slli_epi16(__m128i a, __constrange(0,255) int imm)
#define _mm_slli_epi16(a, imm)                                   \
    __extension__({                                              \
        __m128i ret;                                             \
        if ((imm) <= 0) {                                        \
            ret = a;                                             \
        } else if ((imm) > 15) {                                 \
            ret = _mm_setzero_si128();                           \
        } else {                                                 \
            ret = vreinterpretq_m128i_s16(                       \
                vshlq_n_s16(vreinterpretq_s16_m128i(a), (imm))); \
        }                                                        \
        ret;                                                     \
    })

// Shifts the 4 signed or unsigned 32-bit integers in a left by count bits while
// shifting in zeros.
// https://msdn.microsoft.com/en-us/library/z2k3bbtb%28v=vs.90%29.aspx
// FORCE_INLINE __m128i _mm_slli_epi32(__m128i a, __constrange(0,255) int imm)
#define _mm_slli_epi32(a, imm)                                   \
    __extension__({                                              \
        __m128i ret;                                             \
        if ((imm) <= 0) {                                        \
            ret = a;                                             \
        } else if ((imm) > 31) {                                 \
            ret = _mm_setzero_si128();                           \
        } else {                                                 \
            ret = vreinterpretq_m128i_s32(                       \
                vshlq_n_s32(vreinterpretq_s32_m128i(a), (imm))); \
        }                                                        \
        ret;                                                     \
    })

// Shift packed 64-bit integers in a left by imm8 while shifting in zeros, and
// store the results in dst.
#define _mm_slli_epi64(a, imm)                                   \
    __extension__({                                              \
        __m128i ret;                                             \
        if ((imm) <= 0) {                                        \
            ret = a;                                             \
        } else if ((imm) > 63) {                                 \
            ret = _mm_setzero_si128();                           \
        } else {                                                 \
            ret = vreinterpretq_m128i_s64(                       \
                vshlq_n_s64(vreinterpretq_s64_m128i(a), (imm))); \
        }                                                        \
        ret;                                                     \
    })
// Shifts the 8 signed or unsigned 16-bit integers in a right by count bits
// while shifting in zeros.
//
//   r0 := srl(a0, count)
//   r1 := srl(a1, count)
//   ...
//   r7 := srl(a7, count)
//
// https://msdn.microsoft.com/en-us/library/6tcwd38t(v=vs.90).aspx
// FORCE_INLINE __m128i _mm_srli_epi16(__m128i a, __constrange(0,255) int imm)
#define _mm_srli_epi16(a, imm)                                   \
    __extension__({                                              \
        __m128i ret;                                             \
        if ((imm) <= 0) {                                        \
            ret = a;                                             \
        } else if ((imm) > 15) {                                 \
            ret = _mm_setzero_si128();                           \
        } else {                                                 \
            ret = vreinterpretq_m128i_u16(                       \
                vshrq_n_u16(vreinterpretq_u16_m128i(a), (imm))); \
        }                                                        \
        ret;                                                     \
    })

// Shifts the 4 signed or unsigned 32-bit integers in a right by count bits
// while shifting in zeros.
// https://msdn.microsoft.com/en-us/library/w486zcfa(v=vs.100).aspx
// FORCE_INLINE __m128i _mm_srli_epi32(__m128i a, __constrange(0,255) int imm)
#define _mm_srli_epi32(a, imm)                                   \
    __extension__({                                              \
        __m128i ret;                                             \
        if ((imm) <= 0) {                                        \
            ret = a;                                             \
        } else if ((imm) > 31) {                                 \
            ret = _mm_setzero_si128();                           \
        } else {                                                 \
            ret = vreinterpretq_m128i_u32(                       \
                vshrq_n_u32(vreinterpretq_u32_m128i(a), (imm))); \
        }                                                        \
        ret;                                                     \
    })

// Shift packed 64-bit integers in a right by imm8 while shifting in zeros, and
// store the results in dst.
#define _mm_srli_epi64(a, imm)                                   \
    __extension__({                                              \
        __m128i ret;                                             \
        if ((imm) <= 0) {                                        \
            ret = a;                                             \
        } else if ((imm) > 63) {                                 \
            ret = _mm_setzero_si128();                           \
        } else {                                                 \
            ret = vreinterpretq_m128i_u64(                       \
                vshrq_n_u64(vreinterpretq_u64_m128i(a), (imm))); \
        }                                                        \
        ret;                                                     \
    })
// Shifts the 4 signed 32-bit integers in a right by count bits while shifting
// in the sign bit.
// https://msdn.microsoft.com/en-us/library/z1939387(v=vs.100).aspx
// FORCE_INLINE __m128i _mm_srai_epi32(__m128i a, __constrange(0,255) int imm)
#define _mm_srai_epi32(a, imm)                                   \
    __extension__({                                              \
        __m128i ret;                                             \
        if ((imm) <= 0) {                                        \
            ret = a;                                             \
        } else if ((imm) > 31) {                                 \
            ret = vreinterpretq_m128i_s32(                       \
                vshrq_n_s32(vreinterpretq_s32_m128i(a), 16));    \
            ret = vreinterpretq_m128i_s32(                       \
                vshrq_n_s32(vreinterpretq_s32_m128i(ret), 16));  \
        } else {                                                 \
            ret = vreinterpretq_m128i_s32(                       \
                vshrq_n_s32(vreinterpretq_s32_m128i(a), (imm))); \
        }                                                        \
        ret;                                                     \
    })
// Shifts the 128-bit value in a right by imm bytes while shifting in zeros.
// imm must be an immediate.
//
//   r := srl(a, imm*8)
//
// https://msdn.microsoft.com/en-us/library/305w28yz(v=vs.100).aspx
// FORCE_INLINE _mm_srli_si128(__m128i a, __constrange(0,255) int imm)
#define _mm_srli_si128(a, imm)                                              \
    __extension__({                                                         \
        __m128i ret;                                                        \
        if ((imm) <= 0) {                                                   \
            ret = a;                                                        \
        } else if ((imm) > 15) {                                            \
            ret = _mm_setzero_si128();                                      \
        } else {                                                            \
            ret = vreinterpretq_m128i_s8(                                   \
                vextq_s8(vreinterpretq_s8_m128i(a), vdupq_n_s8(0), (imm))); \
        }                                                                   \
        ret;                                                                \
    })

// Shifts the 128-bit value in a left by imm bytes while shifting in zeros. imm
// must be an immediate.
//
//   r := a << (imm * 8)
//
// https://msdn.microsoft.com/en-us/library/34d3k2kt(v=vs.100).aspx
// FORCE_INLINE __m128i _mm_slli_si128(__m128i a, __constrange(0,255) int imm)
#define _mm_slli_si128(a, imm)                                          \
    __extension__({                                                     \
        __m128i ret;                                                    \
        if ((imm) <= 0) {                                               \
            ret = a;                                                    \
        } else if ((imm) > 15) {                                        \
            ret = _mm_setzero_si128();                                  \
        } else {                                                        \
            ret = vreinterpretq_m128i_s8(vextq_s8(                      \
                vdupq_n_s8(0), vreinterpretq_s8_m128i(a), 16 - (imm))); \
        }                                                               \
        ret;                                                            \
    })
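
// Usage sketch (illustrative): whole-register byte shifts.
//
//   __m128i r1 = _mm_srli_si128(v, 4);  // drop the low 4 bytes, zero-fill high
//   __m128i r2 = _mm_slli_si128(v, 4);  // shift up by 4 bytes, zero-fill low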
// Shifts the 8 signed or unsigned 16-bit integers in a left by count bits while
// shifting in zeros.
//
//   r0 := a0 << count
//   r1 := a1 << count
//   ...
//   r7 := a7 << count
//
// https://msdn.microsoft.com/en-us/library/c79w388h(v%3dvs.90).aspx
FORCE_INLINE __m128i _mm_sll_epi16(__m128i a, __m128i count)
{
    uint64_t c = ((SIMDVec *) &count)->m128_u64[0];
    if (c > 15)
        return _mm_setzero_si128();

    int16x8_t vc = vdupq_n_s16((int16_t) c);
    return vreinterpretq_m128i_s16(vshlq_s16(vreinterpretq_s16_m128i(a), vc));
}
// NEON does not provide a version of this function.
// Creates a 16-bit mask from the most significant bits of the 16 signed or
// unsigned 8-bit integers in a and zero extends the upper bits.
// https://msdn.microsoft.com/en-us/library/vstudio/s090c8fk(v=vs.100).aspx
FORCE_INLINE int _mm_movemask_epi8(__m128i a)
{
    // Use increasingly wide shifts+adds to collect the sign bits together.
    //
    // Since the widening shifts would be rather confusing to follow in little
    // endian, everything will be illustrated in big endian order instead. This
    // has a different result - the bits would actually be reversed on a big
    // endian machine.

    // Starting input (only half the elements are shown):
    // 89 ff 1d c0 00 10 99 33
    uint8x16_t input = vreinterpretq_u8_m128i(a);

    // Shift out everything but the sign bits with an unsigned shift right.
    //
    // Bytes of the vector:
    // 89 ff 1d c0 00 10 99 33
    //  \  \  \  \  \  \  \  \    high_bits = (uint16x4_t)(input >> 7)
    // 01 01 00 01 00 00 01 00
    //
    // Bits of first important lane(s):
    // 10001001 (89)
    // 00000001 (01)
    uint16x8_t high_bits = vreinterpretq_u16_u8(vshrq_n_u8(input, 7));

    // Merge the even lanes together with a 16-bit unsigned shift right + add.
    // 'xx' represents garbage data which will be ignored in the final result.
    // In the important bytes, the add functions like a binary OR.
    //
    // 01 01 00 01 00 00 01 00
    //  \_ |  \_ |  \_ |  \_ |    paired16 = (uint32x4_t)(input + (input >> 7))
    // xx 03 xx 01 xx 00 xx 02
    //
    // 00000001 00000001 (01 01)
    // xxxxxxxx xxxxxx11 (xx 03)
    uint32x4_t paired16 =
        vreinterpretq_u32_u16(vsraq_n_u16(high_bits, high_bits, 7));

    // Repeat with a wider 32-bit shift + add.
    // xx 03 xx 01 xx 00 xx 02
    //     \____ |     \____ |    paired32 = (uint64x1_t)(paired16 + (paired16 >> 14))
    // xx xx xx 0d xx xx xx 02
    //
    // 00000011 00000001 (03 01)
    // xxxxxxxx xxxx1101 (xx 0d)
    uint64x2_t paired32 =
        vreinterpretq_u64_u32(vsraq_n_u32(paired16, paired16, 14));

    // Last, an even wider 64-bit shift + add to get our result in the low 8 bit
    // lanes.
    // xx xx xx 0d xx xx xx 02
    //            \_________ |    paired64 = (uint8x8_t)(paired32 + (paired32 >> 28))
    // xx xx xx xx xx xx xx d2
    //
    // 00001101 00000010 (0d 02)
    // xxxxxxxx 11010010 (xx d2)
    uint8x16_t paired64 =
        vreinterpretq_u8_u64(vsraq_n_u64(paired32, paired32, 28));

    // Extract the low 8 bits from each 64-bit lane with 2 8-bit extracts.
    // xx xx xx xx xx xx xx d2
    //                      ||    return paired64[0]
    //
    // Note: Little endian would return the correct value 4b (01001011) instead.
    return vgetq_lane_u8(paired64, 0) | ((int) vgetq_lane_u8(paired64, 8) << 8);
}
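
// Usage sketch (illustrative):
//
//   __m128i v = _mm_set1_epi8(-1);  // sign bit set in all 16 lanes
//   int m = _mm_movemask_epi8(v);   // m == 0xFFFF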
// NEON does not provide this method
// Creates a 4-bit mask from the most significant bits of the four
// single-precision, floating-point values.
// https://msdn.microsoft.com/en-us/library/vstudio/4490ys29(v=vs.100).aspx
FORCE_INLINE int _mm_movemask_ps(__m128 a)
{
    // Uses the exact same method as _mm_movemask_epi8, see that for details.
    uint32x4_t input = vreinterpretq_u32_m128(a);
    // Shift out everything but the sign bits with a 32-bit unsigned shift
    // right.
    uint64x2_t high_bits = vreinterpretq_u64_u32(vshrq_n_u32(input, 31));
    // Merge the two pairs together with a 64-bit unsigned shift right + add.
    uint8x16_t paired =
        vreinterpretq_u8_u64(vsraq_n_u64(high_bits, high_bits, 31));
    // Extract the result.
    return vgetq_lane_u8(paired, 0) | (vgetq_lane_u8(paired, 8) << 2);
}
// Compute the bitwise AND of 128 bits (representing integer data) in a and
// mask, and return 1 if the result is zero, otherwise return 0.
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_test_all_zeros&expand=5871
FORCE_INLINE int _mm_test_all_zeros(__m128i a, __m128i mask)
{
    int64x2_t a_and_mask =
        vandq_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(mask));
    return (vgetq_lane_s64(a_and_mask, 0) | vgetq_lane_s64(a_and_mask, 1)) ? 0
                                                                           : 1;
}
// ******************************************
// Arithmetic operations
// ******************************************
// Subtracts the four single-precision, floating-point values of a and b.
//
//   r0 := a0 - b0
//   r1 := a1 - b1
//   r2 := a2 - b2
//   r3 := a3 - b3
//
// https://msdn.microsoft.com/en-us/library/vstudio/1zad2k61(v=vs.100).aspx
FORCE_INLINE __m128 _mm_sub_ps(__m128 a, __m128 b)
{
    return vreinterpretq_m128_f32(
        vsubq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
}
// Subtract 2 packed 64-bit integers in b from 2 packed 64-bit integers in a,
// and store the results in dst.
FORCE_INLINE __m128i _mm_sub_epi64(__m128i a, __m128i b)
{
    return vreinterpretq_m128i_s64(
        vsubq_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(b)));
}

// Subtracts the 4 signed or unsigned 32-bit integers of b from the 4 signed or
// unsigned 32-bit integers of a.
// https://msdn.microsoft.com/en-us/library/vstudio/fhh866h0(v=vs.100).aspx
FORCE_INLINE __m128i _mm_sub_epi32(__m128i a, __m128i b)
{
    return vreinterpretq_m128i_s32(
        vsubq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
}

FORCE_INLINE __m128i _mm_sub_epi16(__m128i a, __m128i b)
{
    return vreinterpretq_m128i_s16(
        vsubq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
}

FORCE_INLINE __m128i _mm_sub_epi8(__m128i a, __m128i b)
{
    return vreinterpretq_m128i_s8(
        vsubq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
}
// Subtracts the 8 unsigned 16-bit integers of b from the 8 unsigned 16-bit
// integers of a and saturates.
// https://technet.microsoft.com/en-us/subscriptions/index/f44y0s19(v=vs.90).aspx
FORCE_INLINE __m128i _mm_subs_epu16(__m128i a, __m128i b)
{
    return vreinterpretq_m128i_u16(
        vqsubq_u16(vreinterpretq_u16_m128i(a), vreinterpretq_u16_m128i(b)));
}

// Subtracts the 16 unsigned 8-bit integers of b from the 16 unsigned 8-bit
// integers of a and saturates.
//
//   r0 := UnsignedSaturate(a0 - b0)
//   r1 := UnsignedSaturate(a1 - b1)
//   ...
//   r15 := UnsignedSaturate(a15 - b15)
//
// https://technet.microsoft.com/en-us/subscriptions/yadkxc18(v=vs.90)
FORCE_INLINE __m128i _mm_subs_epu8(__m128i a, __m128i b)
{
    return vreinterpretq_m128i_u8(
        vqsubq_u8(vreinterpretq_u8_m128i(a), vreinterpretq_u8_m128i(b)));
}

// Subtracts the 8 signed 16-bit integers of b from the 8 signed 16-bit integers
// of a and saturates.
//
//   r0 := SignedSaturate(a0 - b0)
//   r1 := SignedSaturate(a1 - b1)
//   ...
//   r7 := SignedSaturate(a7 - b7)
FORCE_INLINE __m128i _mm_subs_epi16(__m128i a, __m128i b)
{
    return vreinterpretq_m128i_s16(
        vqsubq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
}
FORCE_INLINE __m128i _mm_adds_epu16(__m128i a, __m128i b)
{
    return vreinterpretq_m128i_u16(
        vqaddq_u16(vreinterpretq_u16_m128i(a), vreinterpretq_u16_m128i(b)));
}
// Negate packed 8-bit integers in a when the corresponding signed
// 8-bit integer in b is negative, and store the results in dst.
// Elements in dst are zeroed out when the corresponding element
// in b is zero.
//
//   for i in 0..15
//       if b[i] < 0
//           r[i] := -a[i]
//       else if b[i] == 0
//           r[i] := 0
//       else
//           r[i] := a[i]
FORCE_INLINE __m128i _mm_sign_epi8(__m128i _a, __m128i _b)
{
    int8x16_t a = vreinterpretq_s8_m128i(_a);
    int8x16_t b = vreinterpretq_s8_m128i(_b);

    int8x16_t zero = vdupq_n_s8(0);
    // signed shift right: faster than vclt
    // (b < 0) ? 0xFF : 0
    uint8x16_t ltMask = vreinterpretq_u8_s8(vshrq_n_s8(b, 7));
    // (b == 0) ? 0xFF : 0
    int8x16_t zeroMask = vreinterpretq_s8_u8(vceqq_s8(b, zero));

    int8x16_t neg = vnegq_s8(a);
    // bitwise select -a where b is negative, a elsewhere (vbslq takes values
    // from its second argument where the mask bit is set)
    int8x16_t masked = vbslq_s8(ltMask, neg, a);
    // res = masked & (~zeroMask)
    int8x16_t res = vbicq_s8(masked, zeroMask);
    return vreinterpretq_m128i_s8(res);
}
// Negate packed 16-bit integers in a when the corresponding signed
// 16-bit integer in b is negative, and store the results in dst.
// Elements in dst are zeroed out when the corresponding element
// in b is zero.
//
//   for i in 0..7
//       if b[i] < 0
//           r[i] := -a[i]
//       else if b[i] == 0
//           r[i] := 0
//       else
//           r[i] := a[i]
FORCE_INLINE __m128i _mm_sign_epi16(__m128i _a, __m128i _b)
{
    int16x8_t a = vreinterpretq_s16_m128i(_a);
    int16x8_t b = vreinterpretq_s16_m128i(_b);

    int16x8_t zero = vdupq_n_s16(0);
    // signed shift right: faster than vclt
    // (b < 0) ? 0xFFFF : 0
    uint16x8_t ltMask = vreinterpretq_u16_s16(vshrq_n_s16(b, 15));
    // (b == 0) ? 0xFFFF : 0
    int16x8_t zeroMask = vreinterpretq_s16_u16(vceqq_s16(b, zero));

    int16x8_t neg = vnegq_s16(a);
    // bitwise select -a where b is negative, a elsewhere (vbslq takes values
    // from its second argument where the mask bit is set)
    int16x8_t masked = vbslq_s16(ltMask, neg, a);
    // res = masked & (~zeroMask)
    int16x8_t res = vbicq_s16(masked, zeroMask);
    return vreinterpretq_m128i_s16(res);
}
// Negate packed 32-bit integers in a when the corresponding signed
// 32-bit integer in b is negative, and store the results in dst.
// Elements in dst are zeroed out when the corresponding element
// in b is zero.
//
//   for i in 0..3
//       if b[i] < 0
//           r[i] := -a[i]
//       else if b[i] == 0
//           r[i] := 0
//       else
//           r[i] := a[i]
FORCE_INLINE __m128i _mm_sign_epi32(__m128i _a, __m128i _b)
{
    int32x4_t a = vreinterpretq_s32_m128i(_a);
    int32x4_t b = vreinterpretq_s32_m128i(_b);

    int32x4_t zero = vdupq_n_s32(0);
    // signed shift right: faster than vclt
    // (b < 0) ? 0xFFFFFFFF : 0
    uint32x4_t ltMask = vreinterpretq_u32_s32(vshrq_n_s32(b, 31));
    // (b == 0) ? 0xFFFFFFFF : 0
    int32x4_t zeroMask = vreinterpretq_s32_u32(vceqq_s32(b, zero));

    int32x4_t neg = vnegq_s32(a);
    // bitwise select -a where b is negative, a elsewhere (vbslq takes values
    // from its second argument where the mask bit is set)
    int32x4_t masked = vbslq_s32(ltMask, neg, a);
    // res = masked & (~zeroMask)
    int32x4_t res = vbicq_s32(masked, zeroMask);
    return vreinterpretq_m128i_s32(res);
}
// Computes the average of the 16 unsigned 8-bit integers in a and the 16
// unsigned 8-bit integers in b and rounds.
//
//   r0 := (a0 + b0) / 2
//   r1 := (a1 + b1) / 2
//   ...
//   r15 := (a15 + b15) / 2
//
// https://msdn.microsoft.com/en-us/library/vstudio/8zwh554a(v%3dvs.90).aspx
FORCE_INLINE __m128i _mm_avg_epu8(__m128i a, __m128i b)
{
    return vreinterpretq_m128i_u8(
        vrhaddq_u8(vreinterpretq_u8_m128i(a), vreinterpretq_u8_m128i(b)));
}

// Computes the average of the 8 unsigned 16-bit integers in a and the 8
// unsigned 16-bit integers in b and rounds.
//
//   r0 := (a0 + b0) / 2
//   r1 := (a1 + b1) / 2
//   ...
//   r7 := (a7 + b7) / 2
//
// https://msdn.microsoft.com/en-us/library/vstudio/y13ca3c8(v=vs.90).aspx
FORCE_INLINE __m128i _mm_avg_epu16(__m128i a, __m128i b)
{
    return vreinterpretq_m128i_u16(vrhaddq_u16(vreinterpretq_u16_m128i(a),
                                               vreinterpretq_u16_m128i(b)));
}

// Adds the four single-precision, floating-point values of a and b.
//
//   r0 := a0 + b0
//   r1 := a1 + b1
//   r2 := a2 + b2
//   r3 := a3 + b3
//
// https://msdn.microsoft.com/en-us/library/vstudio/c9848chc(v=vs.100).aspx
FORCE_INLINE __m128 _mm_add_ps(__m128 a, __m128 b)
{
    return vreinterpretq_m128_f32(
        vaddq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
}

// Adds the scalar single-precision floating point values of a and b.
// https://msdn.microsoft.com/en-us/library/be94x2y6(v=vs.100).aspx
FORCE_INLINE __m128 _mm_add_ss(__m128 a, __m128 b)
{
    float32_t b0 = vgetq_lane_f32(vreinterpretq_f32_m128(b), 0);
    float32x4_t value = vsetq_lane_f32(b0, vdupq_n_f32(0), 0);
    // the upper values in the result must be the remnants of <a>.
    return vreinterpretq_m128_f32(vaddq_f32(a, value));
}

// Adds the 2 signed or unsigned 64-bit integers in a to the 2 signed or
// unsigned 64-bit integers in b.
// https://msdn.microsoft.com/en-us/library/vstudio/09xs4fkk(v=vs.100).aspx
FORCE_INLINE __m128i _mm_add_epi64(__m128i a, __m128i b)
{
    return vreinterpretq_m128i_s64(
        vaddq_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(b)));
}

// Adds the 4 signed or unsigned 32-bit integers in a to the 4 signed or
// unsigned 32-bit integers in b.
//
//   r0 := a0 + b0
//   r1 := a1 + b1
//   r2 := a2 + b2
//   r3 := a3 + b3
//
// https://msdn.microsoft.com/en-us/library/vstudio/09xs4fkk(v=vs.100).aspx
FORCE_INLINE __m128i _mm_add_epi32(__m128i a, __m128i b)
{
    return vreinterpretq_m128i_s32(
        vaddq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
}

// Adds the 8 signed or unsigned 16-bit integers in a to the 8 signed or
// unsigned 16-bit integers in b.
// https://msdn.microsoft.com/en-us/library/fceha5k4(v=vs.100).aspx
FORCE_INLINE __m128i _mm_add_epi16(__m128i a, __m128i b)
{
    return vreinterpretq_m128i_s16(
        vaddq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
}

// Adds the 16 signed or unsigned 8-bit integers in a to the 16 signed or
// unsigned 8-bit integers in b.
// https://technet.microsoft.com/en-us/subscriptions/yc7tcyzs(v=vs.90)
FORCE_INLINE __m128i _mm_add_epi8(__m128i a, __m128i b)
{
    return vreinterpretq_m128i_s8(
        vaddq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
}

// Adds the 8 signed 16-bit integers in a to the 8 signed 16-bit integers in b
// and saturates.
//
//   r0 := SignedSaturate(a0 + b0)
//   r1 := SignedSaturate(a1 + b1)
//   ...
//   r7 := SignedSaturate(a7 + b7)
//
// https://msdn.microsoft.com/en-us/library/1a306ef8(v=vs.100).aspx
FORCE_INLINE __m128i _mm_adds_epi16(__m128i a, __m128i b)
{
    return vreinterpretq_m128i_s16(
        vqaddq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
}

// Adds the 16 unsigned 8-bit integers in a to the 16 unsigned 8-bit integers
// in b and saturates.
// https://msdn.microsoft.com/en-us/library/9hahyddy(v=vs.100).aspx
FORCE_INLINE __m128i _mm_adds_epu8(__m128i a, __m128i b)
{
    return vreinterpretq_m128i_u8(
        vqaddq_u8(vreinterpretq_u8_m128i(a), vreinterpretq_u8_m128i(b)));
}

// Multiplies the 8 signed or unsigned 16-bit integers from a by the 8 signed or
// unsigned 16-bit integers from b.
//
//   r0 := (a0 * b0)[15:0]
//   r1 := (a1 * b1)[15:0]
//   ...
//   r7 := (a7 * b7)[15:0]
//
// https://msdn.microsoft.com/en-us/library/vstudio/9ks1472s(v=vs.100).aspx
FORCE_INLINE __m128i _mm_mullo_epi16(__m128i a, __m128i b)
{
    return vreinterpretq_m128i_s16(
        vmulq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
}

// Multiplies the 4 signed or unsigned 32-bit integers from a by the 4 signed or
// unsigned 32-bit integers from b.
// https://msdn.microsoft.com/en-us/library/vstudio/bb531409(v=vs.100).aspx
FORCE_INLINE __m128i _mm_mullo_epi32(__m128i a, __m128i b)
{
    return vreinterpretq_m128i_s32(
        vmulq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
}

// Multiplies the four single-precision, floating-point values of a and b.
//
//   r0 := a0 * b0
//   r1 := a1 * b1
//   r2 := a2 * b2
//   r3 := a3 * b3
//
// https://msdn.microsoft.com/en-us/library/vstudio/22kbk6t9(v=vs.100).aspx
FORCE_INLINE __m128 _mm_mul_ps(__m128 a, __m128 b)
{
    return vreinterpretq_m128_f32(
        vmulq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
}

// Multiply the low unsigned 32-bit integers from each packed 64-bit element in
// a and b, and store the unsigned 64-bit results in dst.
//
//   r0 := (a0 & 0xFFFFFFFF) * (b0 & 0xFFFFFFFF)
//   r1 := (a2 & 0xFFFFFFFF) * (b2 & 0xFFFFFFFF)
FORCE_INLINE __m128i _mm_mul_epu32(__m128i a, __m128i b)
{
    // vmull_u32 upcasts instead of masking, so we downcast.
    uint32x2_t a_lo = vmovn_u64(vreinterpretq_u64_m128i(a));
    uint32x2_t b_lo = vmovn_u64(vreinterpretq_u64_m128i(b));
    return vreinterpretq_m128i_u64(vmull_u32(a_lo, b_lo));
}
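
// vmovn_u64 keeps the low 32 bits of each 64-bit lane, which are exactly the
// even-indexed 32-bit elements (a0/a2 and b0/b2) this intrinsic multiplies.
// A small worked example (lane values illustrative, listed low to high):
//
//   a = [3, 5, 7, 9], b = [2, 4, 6, 8]   (32-bit lanes)
//   r = [3 * 2, 7 * 6] = [6, 42]         (64-bit lanes)
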
// Multiply the low signed 32-bit integers from each packed 64-bit element in
// a and b, and store the signed 64-bit results in dst.
//
//   r0 := (int64_t)(int32_t)a0 * (int64_t)(int32_t)b0
//   r1 := (int64_t)(int32_t)a2 * (int64_t)(int32_t)b2
FORCE_INLINE __m128i _mm_mul_epi32(__m128i a, __m128i b)
{
    // vmull_s32 upcasts instead of masking, so we downcast.
    int32x2_t a_lo = vmovn_s64(vreinterpretq_s64_m128i(a));
    int32x2_t b_lo = vmovn_s64(vreinterpretq_s64_m128i(b));
    return vreinterpretq_m128i_s64(vmull_s32(a_lo, b_lo));
}

// Multiplies the 8 signed 16-bit integers from a by the 8 signed 16-bit
// integers from b.
//
//   r0 := (a0 * b0) + (a1 * b1)
//   r1 := (a2 * b2) + (a3 * b3)
//   r2 := (a4 * b4) + (a5 * b5)
//   r3 := (a6 * b6) + (a7 * b7)
//
// https://msdn.microsoft.com/en-us/library/yht36sa6(v=vs.90).aspx
FORCE_INLINE __m128i _mm_madd_epi16(__m128i a, __m128i b)
{
    int32x4_t low = vmull_s16(vget_low_s16(vreinterpretq_s16_m128i(a)),
                              vget_low_s16(vreinterpretq_s16_m128i(b)));
    int32x4_t high = vmull_s16(vget_high_s16(vreinterpretq_s16_m128i(a)),
                               vget_high_s16(vreinterpretq_s16_m128i(b)));

    int32x2_t low_sum = vpadd_s32(vget_low_s32(low), vget_high_s32(low));
    int32x2_t high_sum = vpadd_s32(vget_low_s32(high), vget_high_s32(high));

    return vreinterpretq_m128i_s32(vcombine_s32(low_sum, high_sum));
}
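
// The widening multiplies above produce, e.g.,
// low = [a0*b0, a1*b1, a2*b2, a3*b3]; vpadd_s32 then sums adjacent pairs,
// giving [a0*b0 + a1*b1, a2*b2 + a3*b3], which matches the r0/r1 lanes.
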
// Multiplies packed signed 16-bit integers in a and b, producing intermediate
// signed 32-bit integers. Each intermediate integer is shifted right by 15
// bits with rounding, and the resulting packed 16-bit integers are stored in
// dst.
//
//   r0 := Round(((int32_t)a0 * (int32_t)b0) >> 15)
//   r1 := Round(((int32_t)a1 * (int32_t)b1) >> 15)
//   r2 := Round(((int32_t)a2 * (int32_t)b2) >> 15)
//   ...
//   r7 := Round(((int32_t)a7 * (int32_t)b7) >> 15)
FORCE_INLINE __m128i _mm_mulhrs_epi16(__m128i a, __m128i b)
{
    // vqrdmulhq_s16 would be a one-instruction candidate, but it has issues
    // due to saturation:
    // return vreinterpretq_m128i_s16(vqrdmulhq_s16(a, b));

    // Multiply
    int32x4_t mul_lo = vmull_s16(vget_low_s16(vreinterpretq_s16_m128i(a)),
                                 vget_low_s16(vreinterpretq_s16_m128i(b)));
    int32x4_t mul_hi = vmull_s16(vget_high_s16(vreinterpretq_s16_m128i(a)),
                                 vget_high_s16(vreinterpretq_s16_m128i(b)));

    // Rounding narrowing shift right
    // narrow = (int16_t)((mul + 16384) >> 15);
    int16x4_t narrow_lo = vrshrn_n_s32(mul_lo, 15);
    int16x4_t narrow_hi = vrshrn_n_s32(mul_hi, 15);

    // Join together
    return vreinterpretq_m128i_s16(vcombine_s16(narrow_lo, narrow_hi));
}
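
// Scalar model of one lane (a sketch of what the vector code computes):
//
//   int32_t mul = (int32_t) a0 * (int32_t) b0;
//   int16_t r0 = (int16_t) ((mul + (1 << 14)) >> 15);
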
// Vertically multiply each unsigned 8-bit integer from a with the corresponding
// signed 8-bit integer from b, producing intermediate signed 16-bit integers.
// Horizontally add adjacent pairs of intermediate signed 16-bit integers,
// and pack the saturated results in dst.
//
//   for i in 0..7
//     dst[i+15:i] := Saturate_To_Int16( a[i+15:i+8]*b[i+15:i+8] + a[i+7:i]*b[i+7:i] )
//   done
FORCE_INLINE __m128i _mm_maddubs_epi16(__m128i _a, __m128i _b)
{
    // This would be much simpler if x86 would choose to zero extend OR sign
    // extend, not both. This could probably be optimized better.
    uint16x8_t a = vreinterpretq_u16_m128i(_a);
    int16x8_t b = vreinterpretq_s16_m128i(_b);

    // Zero extend a
    int16x8_t a_odd = vreinterpretq_s16_u16(vshrq_n_u16(a, 8));
    int16x8_t a_even = vreinterpretq_s16_u16(vbicq_u16(a, vdupq_n_u16(0xff00)));

    // Sign extend by shifting left then shifting right.
    int16x8_t b_even = vshrq_n_s16(vshlq_n_s16(b, 8), 8);
    int16x8_t b_odd = vshrq_n_s16(b, 8);

    // multiply
    int16x8_t prod1 = vmulq_s16(a_even, b_even);
    int16x8_t prod2 = vmulq_s16(a_odd, b_odd);

    // saturated add
    return vreinterpretq_m128i_s16(vqaddq_s16(prod1, prod2));
}
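
// Worked example for one output lane (byte values illustrative):
//
//   a0 = 200 (unsigned), a1 = 100;  b0 = -1 (signed), b1 = 2
//   r0 = SignedSaturate16(200 * -1 + 100 * 2) = 0
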
// Computes the absolute difference of the 16 unsigned 8-bit integers from a
// and the 16 unsigned 8-bit integers from b.
//
// Sums the upper 8 differences and lower 8 differences and packs the
// resulting 2 unsigned 16-bit integers into the upper and lower 64-bit
// elements.
//
//   r0 := abs(a0 - b0) + abs(a1 - b1) +...+ abs(a7 - b7)
//   r1 := 0x0
//   r2 := 0x0
//   r3 := 0x0
//   r4 := abs(a8 - b8) + abs(a9 - b9) +...+ abs(a15 - b15)
//   r5 := 0x0
//   r6 := 0x0
//   r7 := 0x0
FORCE_INLINE __m128i _mm_sad_epu8(__m128i a, __m128i b)
{
    uint16x8_t t = vpaddlq_u8(vabdq_u8((uint8x16_t) a, (uint8x16_t) b));
    uint16_t r0 = t[0] + t[1] + t[2] + t[3];
    uint16_t r4 = t[4] + t[5] + t[6] + t[7];
    uint16x8_t r = vsetq_lane_u16(r0, vdupq_n_u16(0), 0);
    return (__m128i) vsetq_lane_u16(r4, r, 4);
}
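
// A typical use is summing the two partial SADs into one value; a minimal
// sketch using other intrinsics from this file (variable names illustrative):
//
//   __m128i sad = _mm_sad_epu8(ref, cur);
//   int total = _mm_cvtsi128_si32(sad) + _mm_extract_epi16(sad, 4);
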
// Divides the four single-precision, floating-point values of a by b.
//
//   r0 := a0 / b0
//   r1 := a1 / b1
//   r2 := a2 / b2
//   r3 := a3 / b3
//
// https://msdn.microsoft.com/en-us/library/edaw8147(v=vs.100).aspx
FORCE_INLINE __m128 _mm_div_ps(__m128 a, __m128 b)
{
    float32x4_t recip0 = vrecpeq_f32(vreinterpretq_f32_m128(b));
    float32x4_t recip1 =
        vmulq_f32(recip0, vrecpsq_f32(recip0, vreinterpretq_f32_m128(b)));
    return vreinterpretq_m128_f32(vmulq_f32(vreinterpretq_f32_m128(a), recip1));
}
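
// vrecpeq_f32 alone gives only a rough reciprocal estimate; the vrecpsq_f32
// Newton-Raphson step above refines it. Where more accuracy is needed, a
// further step can be added before the final multiply (a sketch reusing the
// locals above):
//
//   recip1 = vmulq_f32(recip1, vrecpsq_f32(recip1, vreinterpretq_f32_m128(b)));
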
// Divides the scalar single-precision floating point value of a by b.
// https://msdn.microsoft.com/en-us/library/4y73xa49(v=vs.100).aspx
FORCE_INLINE __m128 _mm_div_ss(__m128 a, __m128 b)
{
    float32_t value =
        vgetq_lane_f32(vreinterpretq_f32_m128(_mm_div_ps(a, b)), 0);
    return vreinterpretq_m128_f32(
        vsetq_lane_f32(value, vreinterpretq_f32_m128(a), 0));
}

// Computes the approximations of reciprocals of the four single-precision,
// floating-point values of a. This version performs additional Newton-Raphson
// iterations to improve accuracy; between 1 and 4 iterations are recommended.
// https://msdn.microsoft.com/en-us/library/vstudio/796k1tty(v=vs.100).aspx
FORCE_INLINE __m128 recipq_newton(__m128 in, int n)
{
    int i;
    float32x4_t recip = vrecpeq_f32(vreinterpretq_f32_m128(in));
    for (i = 0; i < n; ++i) {
        recip =
            vmulq_f32(recip, vrecpsq_f32(recip, vreinterpretq_f32_m128(in)));
    }
    return vreinterpretq_m128_f32(recip);
}

// Computes the approximations of reciprocals of the four single-precision,
// floating-point values of a.
// https://msdn.microsoft.com/en-us/library/vstudio/796k1tty(v=vs.100).aspx
FORCE_INLINE __m128 _mm_rcp_ps(__m128 in)
{
    float32x4_t recip = vrecpeq_f32(vreinterpretq_f32_m128(in));
    recip = vmulq_f32(recip, vrecpsq_f32(recip, vreinterpretq_f32_m128(in)));
    return vreinterpretq_m128_f32(recip);
}

// Computes the approximations of square roots of the four single-precision,
// floating-point values of a. First computes reciprocal square roots and then
// reciprocals of the four values.
//
//   r0 := sqrt(a0)
//   r1 := sqrt(a1)
//   r2 := sqrt(a2)
//   r3 := sqrt(a3)
//
// https://msdn.microsoft.com/en-us/library/vstudio/8z67bwwk(v=vs.100).aspx
FORCE_INLINE __m128 _mm_sqrt_ps(__m128 in)
{
    float32x4_t recipsq = vrsqrteq_f32(vreinterpretq_f32_m128(in));
    float32x4_t sq = vrecpeq_f32(recipsq);
    // TODO: should the step versions (vrsqrtsq_f32, vrecpsq_f32) of both
    // estimates be used for better accuracy?
    return vreinterpretq_m128_f32(sq);
}

// Computes the approximation of the square root of the scalar single-precision
// floating point value of in.
// https://msdn.microsoft.com/en-us/library/ahfsc22d(v=vs.100).aspx
FORCE_INLINE __m128 _mm_sqrt_ss(__m128 in)
{
    float32_t value =
        vgetq_lane_f32(vreinterpretq_f32_m128(_mm_sqrt_ps(in)), 0);
    return vreinterpretq_m128_f32(
        vsetq_lane_f32(value, vreinterpretq_f32_m128(in), 0));
}

// Computes the approximations of the reciprocal square roots of the four
// single-precision floating point values of in.
// https://msdn.microsoft.com/en-us/library/22hfsh53(v=vs.100).aspx
FORCE_INLINE __m128 _mm_rsqrt_ps(__m128 in)
{
    return vreinterpretq_m128_f32(vrsqrteq_f32(vreinterpretq_f32_m128(in)));
}

// Computes the maxima of the four single-precision, floating-point values of
// a and b.
// https://msdn.microsoft.com/en-us/library/vstudio/ff5d607a(v=vs.100).aspx
FORCE_INLINE __m128 _mm_max_ps(__m128 a, __m128 b)
{
    return vreinterpretq_m128_f32(
        vmaxq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
}

// Computes the minima of the four single-precision, floating-point values of a
// and b.
// https://msdn.microsoft.com/en-us/library/vstudio/wh13kadz(v=vs.100).aspx
FORCE_INLINE __m128 _mm_min_ps(__m128 a, __m128 b)
{
    return vreinterpretq_m128_f32(
        vminq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
}

// Computes the maximum of the two lower scalar single-precision floating point
// values of a and b.
// https://msdn.microsoft.com/en-us/library/s6db5esz(v=vs.100).aspx
FORCE_INLINE __m128 _mm_max_ss(__m128 a, __m128 b)
{
    float32_t value = vgetq_lane_f32(
        vmaxq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)), 0);
    return vreinterpretq_m128_f32(
        vsetq_lane_f32(value, vreinterpretq_f32_m128(a), 0));
}

// Computes the minimum of the two lower scalar single-precision floating point
// values of a and b.
// https://msdn.microsoft.com/en-us/library/0a9y7xaa(v=vs.100).aspx
FORCE_INLINE __m128 _mm_min_ss(__m128 a, __m128 b)
{
    float32_t value = vgetq_lane_f32(
        vminq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)), 0);
    return vreinterpretq_m128_f32(
        vsetq_lane_f32(value, vreinterpretq_f32_m128(a), 0));
}

// Computes the pairwise maxima of the 16 unsigned 8-bit integers from a and the
// 16 unsigned 8-bit integers from b.
// https://msdn.microsoft.com/en-us/library/st6634za(v=vs.100).aspx
FORCE_INLINE __m128i _mm_max_epu8(__m128i a, __m128i b)
{
    return vreinterpretq_m128i_u8(
        vmaxq_u8(vreinterpretq_u8_m128i(a), vreinterpretq_u8_m128i(b)));
}

// Computes the pairwise minima of the 16 unsigned 8-bit integers from a and the
// 16 unsigned 8-bit integers from b.
// https://msdn.microsoft.com/ko-kr/library/17k8cf58(v=vs.100).aspxx
FORCE_INLINE __m128i _mm_min_epu8(__m128i a, __m128i b)
{
    return vreinterpretq_m128i_u8(
        vminq_u8(vreinterpretq_u8_m128i(a), vreinterpretq_u8_m128i(b)));
}

// Computes the pairwise minima of the 8 signed 16-bit integers from a and the 8
// signed 16-bit integers from b.
// https://msdn.microsoft.com/en-us/library/vstudio/6te997ew(v=vs.100).aspx
FORCE_INLINE __m128i _mm_min_epi16(__m128i a, __m128i b)
{
    return vreinterpretq_m128i_s16(
        vminq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
}

// Computes the pairwise maxima of the 8 signed 16-bit integers from a and the 8
// signed 16-bit integers from b.
// https://msdn.microsoft.com/en-us/LIBRary/3x060h7c(v=vs.100).aspx
FORCE_INLINE __m128i _mm_max_epi16(__m128i a, __m128i b)
{
    return vreinterpretq_m128i_s16(
        vmaxq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
}

// epi versions of min/max
// Computes the pairwise maxima of the four signed 32-bit integer values of a
// and b.
//
// A 128-bit parameter that can be defined with the following equations:
//   r0 := (a0 > b0) ? a0 : b0
//   r1 := (a1 > b1) ? a1 : b1
//   r2 := (a2 > b2) ? a2 : b2
//   r3 := (a3 > b3) ? a3 : b3
//
// https://msdn.microsoft.com/en-us/library/vstudio/bb514055(v=vs.100).aspx
FORCE_INLINE __m128i _mm_max_epi32(__m128i a, __m128i b)
{
    return vreinterpretq_m128i_s32(
        vmaxq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
}

// Computes the pairwise minima of the four signed 32-bit integer values of a
// and b.
//
// A 128-bit parameter that can be defined with the following equations:
//   r0 := (a0 < b0) ? a0 : b0
//   r1 := (a1 < b1) ? a1 : b1
//   r2 := (a2 < b2) ? a2 : b2
//   r3 := (a3 < b3) ? a3 : b3
//
// https://msdn.microsoft.com/en-us/library/vstudio/bb531476(v=vs.100).aspx
FORCE_INLINE __m128i _mm_min_epi32(__m128i a, __m128i b)
{
    return vreinterpretq_m128i_s32(
        vminq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
}

// Multiplies the 8 signed 16-bit integers from a by the 8 signed 16-bit
// integers from b and keeps the upper 16 bits of each product.
//
//   r0 := (a0 * b0)[31:16]
//   r1 := (a1 * b1)[31:16]
//   ...
//   r7 := (a7 * b7)[31:16]
//
// https://msdn.microsoft.com/en-us/library/vstudio/59hddw1d(v=vs.100).aspx
FORCE_INLINE __m128i _mm_mulhi_epi16(__m128i a, __m128i b)
{
    /* FIXME: issue with large values because of result saturation */
    // int16x8_t ret = vqdmulhq_s16(vreinterpretq_s16_m128i(a),
    //                              vreinterpretq_s16_m128i(b)); /* =2*a*b */
    // return vreinterpretq_m128i_s16(vshrq_n_s16(ret, 1));
    int16x4_t a3210 = vget_low_s16(vreinterpretq_s16_m128i(a));
    int16x4_t b3210 = vget_low_s16(vreinterpretq_s16_m128i(b));
    int32x4_t ab3210 = vmull_s16(a3210, b3210); /* 3333222211110000 */
    int16x4_t a7654 = vget_high_s16(vreinterpretq_s16_m128i(a));
    int16x4_t b7654 = vget_high_s16(vreinterpretq_s16_m128i(b));
    int32x4_t ab7654 = vmull_s16(a7654, b7654); /* 7777666655554444 */
    uint16x8x2_t r =
        vuzpq_u16(vreinterpretq_u16_s32(ab3210), vreinterpretq_u16_s32(ab7654));
    return vreinterpretq_m128i_u16(r.val[1]);
}

// Computes pairwise add of each argument as single-precision, floating-point
// values a and b.
// https://msdn.microsoft.com/en-us/library/yd9wecaa.aspx
FORCE_INLINE __m128 _mm_hadd_ps(__m128 a, __m128 b)
{
#if defined(__aarch64__)
    return vreinterpretq_m128_f32(vpaddq_f32(
        vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));  // AArch64
#else
    float32x2_t a10 = vget_low_f32(vreinterpretq_f32_m128(a));
    float32x2_t a32 = vget_high_f32(vreinterpretq_f32_m128(a));
    float32x2_t b10 = vget_low_f32(vreinterpretq_f32_m128(b));
    float32x2_t b32 = vget_high_f32(vreinterpretq_f32_m128(b));
    return vreinterpretq_m128_f32(
        vcombine_f32(vpadd_f32(a10, a32), vpadd_f32(b10, b32)));
#endif
}

// Computes pairwise add of each argument as a 16-bit signed or unsigned integer
// values a and b.
FORCE_INLINE __m128i _mm_hadd_epi16(__m128i _a, __m128i _b)
{
    int16x8_t a = vreinterpretq_s16_m128i(_a);
    int16x8_t b = vreinterpretq_s16_m128i(_b);
#if defined(__aarch64__)
    return vreinterpretq_m128i_s16(vpaddq_s16(a, b));
#else
    return vreinterpretq_m128i_s16(
        vcombine_s16(vpadd_s16(vget_low_s16(a), vget_high_s16(a)),
                     vpadd_s16(vget_low_s16(b), vget_high_s16(b))));
#endif
}

// Computes pairwise difference of each argument as a 16-bit signed or unsigned
// integer values a and b.
FORCE_INLINE __m128i _mm_hsub_epi16(__m128i _a, __m128i _b)
{
    int32x4_t a = vreinterpretq_s32_m128i(_a);
    int32x4_t b = vreinterpretq_s32_m128i(_b);
    // Interleave using vshrn/vmovn
    // [a0|a2|a4|a6|b0|b2|b4|b6]
    // [a1|a3|a5|a7|b1|b3|b5|b7]
    int16x8_t ab0246 = vcombine_s16(vmovn_s32(a), vmovn_s32(b));
    int16x8_t ab1357 = vcombine_s16(vshrn_n_s32(a, 16), vshrn_n_s32(b, 16));
    // Subtract
    return vreinterpretq_m128i_s16(vsubq_s16(ab0246, ab1357));
}
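
// vmovn_s32 keeps the even-indexed 16-bit halves and vshrn_n_s32(x, 16) the
// odd-indexed ones, so the final subtract computes, lane by lane:
//
//   r = [a0-a1, a2-a3, a4-a5, a6-a7, b0-b1, b2-b3, b4-b5, b6-b7]
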
// Computes saturated pairwise add of each argument as a 16-bit signed
// integer values a and b.
FORCE_INLINE __m128i _mm_hadds_epi16(__m128i _a, __m128i _b)
{
    int32x4_t a = vreinterpretq_s32_m128i(_a);
    int32x4_t b = vreinterpretq_s32_m128i(_b);
    // Interleave using vshrn/vmovn
    // [a0|a2|a4|a6|b0|b2|b4|b6]
    // [a1|a3|a5|a7|b1|b3|b5|b7]
    int16x8_t ab0246 = vcombine_s16(vmovn_s32(a), vmovn_s32(b));
    int16x8_t ab1357 = vcombine_s16(vshrn_n_s32(a, 16), vshrn_n_s32(b, 16));
    // Saturated add
    return vreinterpretq_m128i_s16(vqaddq_s16(ab0246, ab1357));
}

// Computes saturated pairwise difference of each argument as a 16-bit signed
// integer values a and b.
FORCE_INLINE __m128i _mm_hsubs_epi16(__m128i _a, __m128i _b)
{
    int32x4_t a = vreinterpretq_s32_m128i(_a);
    int32x4_t b = vreinterpretq_s32_m128i(_b);
    // Interleave using vshrn/vmovn
    // [a0|a2|a4|a6|b0|b2|b4|b6]
    // [a1|a3|a5|a7|b1|b3|b5|b7]
    int16x8_t ab0246 = vcombine_s16(vmovn_s32(a), vmovn_s32(b));
    int16x8_t ab1357 = vcombine_s16(vshrn_n_s32(a, 16), vshrn_n_s32(b, 16));
    // Saturated subtract
    return vreinterpretq_m128i_s16(vqsubq_s16(ab0246, ab1357));
}

// Computes pairwise add of each argument as a 32-bit signed or unsigned integer
// values a and b.
FORCE_INLINE __m128i _mm_hadd_epi32(__m128i _a, __m128i _b)
{
    int32x4_t a = vreinterpretq_s32_m128i(_a);
    int32x4_t b = vreinterpretq_s32_m128i(_b);
    return vreinterpretq_m128i_s32(
        vcombine_s32(vpadd_s32(vget_low_s32(a), vget_high_s32(a)),
                     vpadd_s32(vget_low_s32(b), vget_high_s32(b))));
}

// Computes pairwise difference of each argument as a 32-bit signed or unsigned
// integer values a and b.
FORCE_INLINE __m128i _mm_hsub_epi32(__m128i _a, __m128i _b)
{
    int64x2_t a = vreinterpretq_s64_m128i(_a);
    int64x2_t b = vreinterpretq_s64_m128i(_b);
    // Interleave using vshrn/vmovn
    // [a0|a2|b0|b2]
    // [a1|a3|b1|b3]
    int32x4_t ab02 = vcombine_s32(vmovn_s64(a), vmovn_s64(b));
    int32x4_t ab13 = vcombine_s32(vshrn_n_s64(a, 32), vshrn_n_s64(b, 32));
    // Subtract
    return vreinterpretq_m128i_s32(vsubq_s32(ab02, ab13));
}

// ******************************************
// Compare operations
// ******************************************

// Compares for less than.
// https://msdn.microsoft.com/en-us/library/vstudio/f330yhc8(v=vs.100).aspx
FORCE_INLINE __m128 _mm_cmplt_ps(__m128 a, __m128 b)
{
    return vreinterpretq_m128_u32(
        vcltq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
}

// Compares for greater than.
//
//   r0 := (a0 > b0) ? 0xffffffff : 0x0
//   r1 := (a1 > b1) ? 0xffffffff : 0x0
//   r2 := (a2 > b2) ? 0xffffffff : 0x0
//   r3 := (a3 > b3) ? 0xffffffff : 0x0
//
// https://msdn.microsoft.com/en-us/library/vstudio/11dy102s(v=vs.100).aspx
FORCE_INLINE __m128 _mm_cmpgt_ps(__m128 a, __m128 b)
{
    return vreinterpretq_m128_u32(
        vcgtq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
}

// Compares for greater than or equal.
// https://msdn.microsoft.com/en-us/library/vstudio/fs813y2t(v=vs.100).aspx
FORCE_INLINE __m128 _mm_cmpge_ps(__m128 a, __m128 b)
{
    return vreinterpretq_m128_u32(
        vcgeq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
}

// Compares for less than or equal.
//
//   r0 := (a0 <= b0) ? 0xffffffff : 0x0
//   r1 := (a1 <= b1) ? 0xffffffff : 0x0
//   r2 := (a2 <= b2) ? 0xffffffff : 0x0
//   r3 := (a3 <= b3) ? 0xffffffff : 0x0
//
// https://msdn.microsoft.com/en-us/library/vstudio/1s75w83z(v=vs.100).aspx
FORCE_INLINE __m128 _mm_cmple_ps(__m128 a, __m128 b)
{
    return vreinterpretq_m128_u32(
        vcleq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
}

// Compares for equality.
// https://msdn.microsoft.com/en-us/library/vstudio/36aectz5(v=vs.100).aspx
FORCE_INLINE __m128 _mm_cmpeq_ps(__m128 a, __m128 b)
{
    return vreinterpretq_m128_u32(
        vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
}

// Compares the 16 signed or unsigned 8-bit integers in a and the 16 signed or
// unsigned 8-bit integers in b for equality.
// https://msdn.microsoft.com/en-us/library/windows/desktop/bz5xk21a(v=vs.90).aspx
FORCE_INLINE __m128i _mm_cmpeq_epi8(__m128i a, __m128i b)
{
    return vreinterpretq_m128i_u8(
        vceqq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
}

// Compares the 8 signed or unsigned 16-bit integers in a and the 8 signed or
// unsigned 16-bit integers in b for equality.
// https://msdn.microsoft.com/en-us/library/2ay060te(v=vs.100).aspx
FORCE_INLINE __m128i _mm_cmpeq_epi16(__m128i a, __m128i b)
{
    return vreinterpretq_m128i_u16(
        vceqq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
}

// Compare packed 32-bit integers in a and b for equality, and store the results
// in dst.
FORCE_INLINE __m128i _mm_cmpeq_epi32(__m128i a, __m128i b)
{
    return vreinterpretq_m128i_u32(
        vceqq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
}

// Compare packed 64-bit integers in a and b for equality, and store the results
// in dst.
FORCE_INLINE __m128i _mm_cmpeq_epi64(__m128i a, __m128i b)
{
#if defined(__aarch64__)
    return vreinterpretq_m128i_u64(
        vceqq_u64(vreinterpretq_u64_m128i(a), vreinterpretq_u64_m128i(b)));
#else
    // ARMv7 lacks vceqq_u64
    // (a == b) -> (a_lo == b_lo) && (a_hi == b_hi)
    uint32x4_t cmp =
        vceqq_u32(vreinterpretq_u32_m128i(a), vreinterpretq_u32_m128i(b));
    uint32x4_t swapped = vrev64q_u32(cmp);
    return vreinterpretq_m128i_u32(vandq_u32(cmp, swapped));
#endif
}
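
// The trick: vrev64q_u32 swaps the two 32-bit halves of each 64-bit lane, so
// ANDing the compare mask with its swapped self is all-ones only when both
// halves matched. For example, if only the low halves are equal:
//
//   cmp           = [0xFFFFFFFF, 0x00000000, ...]
//   swapped       = [0x00000000, 0xFFFFFFFF, ...]
//   cmp & swapped = [0x00000000, 0x00000000, ...]   -> lane reports not-equal
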
// Compares the 16 signed 8-bit integers in a and the 16 signed 8-bit integers
// in b for less than.
// https://msdn.microsoft.com/en-us/library/windows/desktop/9s46csht(v=vs.90).aspx
FORCE_INLINE __m128i _mm_cmplt_epi8(__m128i a, __m128i b)
{
    return vreinterpretq_m128i_u8(
        vcltq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
}

// Compares the 16 signed 8-bit integers in a and the 16 signed 8-bit integers
// in b for greater than.
//
//   r0 := (a0 > b0) ? 0xff : 0x0
//   r1 := (a1 > b1) ? 0xff : 0x0
//   ...
//   r15 := (a15 > b15) ? 0xff : 0x0
//
// https://msdn.microsoft.com/zh-tw/library/wf45zt2b(v=vs.100).aspx
FORCE_INLINE __m128i _mm_cmpgt_epi8(__m128i a, __m128i b)
{
    return vreinterpretq_m128i_u8(
        vcgtq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
}

// Compares the 8 signed 16-bit integers in a and the 8 signed 16-bit integers
// in b for less than.
//
//   r0 := (a0 < b0) ? 0xffff : 0x0
//   r1 := (a1 < b1) ? 0xffff : 0x0
//   ...
//   r7 := (a7 < b7) ? 0xffff : 0x0
//
// https://technet.microsoft.com/en-us/library/t863edb2(v=vs.100).aspx
FORCE_INLINE __m128i _mm_cmplt_epi16(__m128i a, __m128i b)
{
    return vreinterpretq_m128i_u16(
        vcltq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
}

// Compares the 8 signed 16-bit integers in a and the 8 signed 16-bit integers
// in b for greater than.
//
//   r0 := (a0 > b0) ? 0xffff : 0x0
//   r1 := (a1 > b1) ? 0xffff : 0x0
//   ...
//   r7 := (a7 > b7) ? 0xffff : 0x0
//
// https://technet.microsoft.com/en-us/library/xd43yfsa(v=vs.100).aspx
FORCE_INLINE __m128i _mm_cmpgt_epi16(__m128i a, __m128i b)
{
    return vreinterpretq_m128i_u16(
        vcgtq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
}

// Compares the 4 signed 32-bit integers in a and the 4 signed 32-bit integers
// in b for less than.
// https://msdn.microsoft.com/en-us/library/vstudio/4ak0bf5d(v=vs.100).aspx
FORCE_INLINE __m128i _mm_cmplt_epi32(__m128i a, __m128i b)
{
    return vreinterpretq_m128i_u32(
        vcltq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
}

// Compares the 4 signed 32-bit integers in a and the 4 signed 32-bit integers
// in b for greater than.
// https://msdn.microsoft.com/en-us/library/vstudio/1s9f2z0y(v=vs.100).aspx
FORCE_INLINE __m128i _mm_cmpgt_epi32(__m128i a, __m128i b)
{
    return vreinterpretq_m128i_u32(
        vcgtq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
}

// Compares the 2 signed 64-bit integers in a and the 2 signed 64-bit integers
// in b for greater than.
FORCE_INLINE __m128i _mm_cmpgt_epi64(__m128i a, __m128i b)
{
#if defined(__aarch64__)
    return vreinterpretq_m128i_u64(
        vcgtq_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(b)));
#else
    // ARMv7 lacks vcgtq_s64.
    // This is based off of Clang's SSE2 polyfill:
    // (a > b) -> ((a_hi > b_hi) || (a_lo > b_lo && a_hi == b_hi))

    // Mask the sign bit out since we need a signed AND an unsigned comparison
    // and it is ugly to try and split them.
    int32x4_t mask = vreinterpretq_s32_s64(vdupq_n_s64(0x80000000ull));
    int32x4_t a_mask = veorq_s32(vreinterpretq_s32_m128i(a), mask);
    int32x4_t b_mask = veorq_s32(vreinterpretq_s32_m128i(b), mask);
    // Compare for greater than
    int64x2_t greater = vreinterpretq_s64_u32(vcgtq_s32(a_mask, b_mask));
    // Copy upper mask to lower mask
    int64x2_t gt_hi = vshrq_n_s64(greater, 63);
    // Copy lower mask to upper mask
    int64x2_t gt_lo = vsliq_n_s64(greater, greater, 32);
    // Compare for equality
    int64x2_t equal = vreinterpretq_s64_u32(vceqq_s32(a_mask, b_mask));
    // Copy upper mask to lower mask
    int64x2_t eq_hi = vshrq_n_s64(equal, 63);
    // a_hi > b_hi || (a_lo > b_lo && a_hi == b_hi)
    int64x2_t ret = vorrq_s64(gt_hi, vandq_s64(gt_lo, eq_hi));
    return vreinterpretq_m128i_s64(ret);
#endif
}
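
// Scalar model of one 64-bit lane (a sketch of the decomposition above, with
// the low halves compared as unsigned):
//
//   int gt = (a_hi > b_hi) || (a_hi == b_hi && a_lo > b_lo);
//   r = gt ? 0xFFFFFFFFFFFFFFFF : 0x0;
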
// Compares the four 32-bit floats in a and b to check if any values are NaN.
// Ordered compare between each value returns true for "orderable" and false for
// "not orderable" (NaN).
// https://msdn.microsoft.com/en-us/library/vstudio/0h9w00fx(v=vs.100).aspx
// See also:
// http://stackoverflow.com/questions/8627331/what-does-ordered-unordered-comparison-mean
// http://stackoverflow.com/questions/29349621/neon-isnanval-intrinsics
FORCE_INLINE __m128 _mm_cmpord_ps(__m128 a, __m128 b)
{
    // Note: NEON does not have ordered compare builtin
    // Need to compare a eq a and b eq b to check for NaN
    // Do AND of results to get final
    uint32x4_t ceqaa =
        vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a));
    uint32x4_t ceqbb =
        vceqq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b));
    return vreinterpretq_m128_u32(vandq_u32(ceqaa, ceqbb));
}
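
// This relies on NaN being the only value for which x == x is false, e.g.:
//
//   a = [1.0f, NAN, ...], b = [2.0f, 3.0f, ...]
//   r = [0xFFFFFFFF, 0x00000000, ...]
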
// Compares the lower single-precision floating point scalar values of a and b
// using a less than operation.
// https://msdn.microsoft.com/en-us/library/2kwe606b(v=vs.90).aspx
// Important note!! The documentation on MSDN is incorrect! If either of the
// values is a NaN the docs say you will get a one, but in fact, it will return
// a zero!!
FORCE_INLINE int _mm_comilt_ss(__m128 a, __m128 b)
{
    uint32x4_t a_not_nan =
        vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a));
    uint32x4_t b_not_nan =
        vceqq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b));
    uint32x4_t a_and_b_not_nan = vandq_u32(a_not_nan, b_not_nan);
    uint32x4_t a_lt_b =
        vcltq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b));
    return (vgetq_lane_u32(vandq_u32(a_and_b_not_nan, a_lt_b), 0) != 0) ? 1 : 0;
}

// Compares the lower single-precision floating point scalar values of a and b
// using a greater than operation.
// https://msdn.microsoft.com/en-us/library/b0738e0t(v=vs.100).aspx
FORCE_INLINE int _mm_comigt_ss(__m128 a, __m128 b)
{
    // return vgetq_lane_u32(vcgtq_f32(vreinterpretq_f32_m128(a),
    //                                 vreinterpretq_f32_m128(b)), 0);
    uint32x4_t a_not_nan =
        vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a));
    uint32x4_t b_not_nan =
        vceqq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b));
    uint32x4_t a_and_b_not_nan = vandq_u32(a_not_nan, b_not_nan);
    uint32x4_t a_gt_b =
        vcgtq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b));
    return (vgetq_lane_u32(vandq_u32(a_and_b_not_nan, a_gt_b), 0) != 0) ? 1 : 0;
}

// Compares the lower single-precision floating point scalar values of a and b
// using a less than or equal operation.
// https://msdn.microsoft.com/en-us/library/1w4t7c57(v=vs.90).aspx
FORCE_INLINE int _mm_comile_ss(__m128 a, __m128 b)
{
    // return vgetq_lane_u32(vcleq_f32(vreinterpretq_f32_m128(a),
    //                                 vreinterpretq_f32_m128(b)), 0);
    uint32x4_t a_not_nan =
        vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a));
    uint32x4_t b_not_nan =
        vceqq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b));
    uint32x4_t a_and_b_not_nan = vandq_u32(a_not_nan, b_not_nan);
    uint32x4_t a_le_b =
        vcleq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b));
    return (vgetq_lane_u32(vandq_u32(a_and_b_not_nan, a_le_b), 0) != 0) ? 1 : 0;
}

// Compares the lower single-precision floating point scalar values of a and b
// using a greater than or equal operation.
// https://msdn.microsoft.com/en-us/library/8t80des6(v=vs.100).aspx
FORCE_INLINE int _mm_comige_ss(__m128 a, __m128 b)
{
    // return vgetq_lane_u32(vcgeq_f32(vreinterpretq_f32_m128(a),
    //                                 vreinterpretq_f32_m128(b)), 0);
    uint32x4_t a_not_nan =
        vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a));
    uint32x4_t b_not_nan =
        vceqq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b));
    uint32x4_t a_and_b_not_nan = vandq_u32(a_not_nan, b_not_nan);
    uint32x4_t a_ge_b =
        vcgeq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b));
    return (vgetq_lane_u32(vandq_u32(a_and_b_not_nan, a_ge_b), 0) != 0) ? 1 : 0;
}

// Compares the lower single-precision floating point scalar values of a and b
// using an equality operation.
// https://msdn.microsoft.com/en-us/library/93yx2h2b(v=vs.100).aspx
FORCE_INLINE int _mm_comieq_ss(__m128 a, __m128 b)
{
    // return vgetq_lane_u32(vceqq_f32(vreinterpretq_f32_m128(a),
    //                                 vreinterpretq_f32_m128(b)), 0);
    uint32x4_t a_not_nan =
        vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a));
    uint32x4_t b_not_nan =
        vceqq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b));
    uint32x4_t a_and_b_not_nan = vandq_u32(a_not_nan, b_not_nan);
    uint32x4_t a_eq_b =
        vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b));
    return (vgetq_lane_u32(vandq_u32(a_and_b_not_nan, a_eq_b), 0) != 0) ? 1 : 0;
}

// Compares the lower single-precision floating point scalar values of a and b
// using an inequality operation.
// https://msdn.microsoft.com/en-us/library/bafh5e0a(v=vs.90).aspx
FORCE_INLINE int _mm_comineq_ss(__m128 a, __m128 b)
{
    // return !vgetq_lane_u32(vceqq_f32(vreinterpretq_f32_m128(a),
    //                                  vreinterpretq_f32_m128(b)), 0);
    uint32x4_t a_not_nan =
        vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a));
    uint32x4_t b_not_nan =
        vceqq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b));
    uint32x4_t a_or_b_nan = vmvnq_u32(vandq_u32(a_not_nan, b_not_nan));
    uint32x4_t a_neq_b = vmvnq_u32(
        vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
    return (vgetq_lane_u32(vorrq_u32(a_or_b_nan, a_neq_b), 0) != 0) ? 1 : 0;
}

// According to the documentation, these intrinsics behave the same as the
// non-'u' versions. We'll just alias them here.
#define _mm_ucomilt_ss _mm_comilt_ss
#define _mm_ucomile_ss _mm_comile_ss
#define _mm_ucomigt_ss _mm_comigt_ss
#define _mm_ucomige_ss _mm_comige_ss
#define _mm_ucomieq_ss _mm_comieq_ss
#define _mm_ucomineq_ss _mm_comineq_ss

// ******************************************
// Conversions
// ******************************************

// Converts the four single-precision, floating-point values of a to signed
// 32-bit integer values using truncate.
// https://msdn.microsoft.com/en-us/library/vstudio/1h005y6x(v=vs.100).aspx
FORCE_INLINE __m128i _mm_cvttps_epi32(__m128 a)
{
    return vreinterpretq_m128i_s32(vcvtq_s32_f32(vreinterpretq_f32_m128(a)));
}

// Converts the four signed 32-bit integer values of a to single-precision,
// floating-point values.
// https://msdn.microsoft.com/en-us/library/vstudio/36bwxcx5(v=vs.100).aspx
FORCE_INLINE __m128 _mm_cvtepi32_ps(__m128i a)
{
    return vreinterpretq_m128_f32(vcvtq_f32_s32(vreinterpretq_s32_m128i(a)));
}

// Converts the eight unsigned 8-bit integers in the lower 64 bits to eight
// unsigned 16-bit integers.
FORCE_INLINE __m128i _mm_cvtepu8_epi16(__m128i a)
{
    uint8x16_t u8x16 = vreinterpretq_u8_m128i(a);    /* xxxx xxxx HGFE DCBA */
    uint16x8_t u16x8 = vmovl_u8(vget_low_u8(u8x16)); /* 0H0G 0F0E 0D0C 0B0A */
    return vreinterpretq_m128i_u16(u16x8);
}

// Converts the four unsigned 8-bit integers in the lower 32 bits to four
// unsigned 32-bit integers.
// https://msdn.microsoft.com/en-us/library/bb531467%28v=vs.100%29.aspx
FORCE_INLINE __m128i _mm_cvtepu8_epi32(__m128i a)
{
    uint8x16_t u8x16 = vreinterpretq_u8_m128i(a);      /* xxxx xxxx xxxx DCBA */
    uint16x8_t u16x8 = vmovl_u8(vget_low_u8(u8x16));   /* 0x0x 0x0x 0D0C 0B0A */
    uint32x4_t u32x4 = vmovl_u16(vget_low_u16(u16x8)); /* 000D 000C 000B 000A */
    return vreinterpretq_m128i_u32(u32x4);
}

// Converts the two unsigned 8-bit integers in the lower 16 bits to two
// unsigned 64-bit integers.
FORCE_INLINE __m128i _mm_cvtepu8_epi64(__m128i a)
{
    uint8x16_t u8x16 = vreinterpretq_u8_m128i(a);      /* xxxx xxxx xxxx xxBA */
    uint16x8_t u16x8 = vmovl_u8(vget_low_u8(u8x16));   /* 0x0x 0x0x 0x0x 0B0A */
    uint32x4_t u32x4 = vmovl_u16(vget_low_u16(u16x8)); /* 000x 000x 000B 000A */
    uint64x2_t u64x2 = vmovl_u32(vget_low_u32(u32x4)); /* 0000 000B 0000 000A */
    return vreinterpretq_m128i_u64(u64x2);
}

// Converts the eight signed 8-bit integers in the lower 64 bits to eight
// signed 16-bit integers.
FORCE_INLINE __m128i _mm_cvtepi8_epi16(__m128i a)
{
    int8x16_t s8x16 = vreinterpretq_s8_m128i(a);    /* xxxx xxxx HGFE DCBA */
    int16x8_t s16x8 = vmovl_s8(vget_low_s8(s8x16)); /* 0H0G 0F0E 0D0C 0B0A */
    return vreinterpretq_m128i_s16(s16x8);
}

// Converts the four signed 8-bit integers in the lower 32 bits to four
// signed 32-bit integers.
FORCE_INLINE __m128i _mm_cvtepi8_epi32(__m128i a)
{
    int8x16_t s8x16 = vreinterpretq_s8_m128i(a);      /* xxxx xxxx xxxx DCBA */
    int16x8_t s16x8 = vmovl_s8(vget_low_s8(s8x16));   /* 0x0x 0x0x 0D0C 0B0A */
    int32x4_t s32x4 = vmovl_s16(vget_low_s16(s16x8)); /* 000D 000C 000B 000A */
    return vreinterpretq_m128i_s32(s32x4);
}

// Converts the two signed 8-bit integers in the lower 16 bits to two
// signed 64-bit integers.
FORCE_INLINE __m128i _mm_cvtepi8_epi64(__m128i a)
{
    int8x16_t s8x16 = vreinterpretq_s8_m128i(a);      /* xxxx xxxx xxxx xxBA */
    int16x8_t s16x8 = vmovl_s8(vget_low_s8(s8x16));   /* 0x0x 0x0x 0x0x 0B0A */
    int32x4_t s32x4 = vmovl_s16(vget_low_s16(s16x8)); /* 000x 000x 000B 000A */
    int64x2_t s64x2 = vmovl_s32(vget_low_s32(s32x4)); /* 0000 000B 0000 000A */
    return vreinterpretq_m128i_s64(s64x2);
}

// Converts the four signed 16-bit integers in the lower 64 bits to four signed
// 32-bit integers.
FORCE_INLINE __m128i _mm_cvtepi16_epi32(__m128i a)
{
    return vreinterpretq_m128i_s32(
        vmovl_s16(vget_low_s16(vreinterpretq_s16_m128i(a))));
}

// Converts the two signed 16-bit integers in the lower 32 bits to two signed
// 64-bit integers.
FORCE_INLINE __m128i _mm_cvtepi16_epi64(__m128i a)
{
    int16x8_t s16x8 = vreinterpretq_s16_m128i(a);     /* xxxx xxxx xxxx 0B0A */
    int32x4_t s32x4 = vmovl_s16(vget_low_s16(s16x8)); /* 000x 000x 000B 000A */
    int64x2_t s64x2 = vmovl_s32(vget_low_s32(s32x4)); /* 0000 000B 0000 000A */
    return vreinterpretq_m128i_s64(s64x2);
}

// Converts the four unsigned 16-bit integers in the lower 64 bits to four
// unsigned 32-bit integers.
FORCE_INLINE __m128i _mm_cvtepu16_epi32(__m128i a)
{
    return vreinterpretq_m128i_u32(
        vmovl_u16(vget_low_u16(vreinterpretq_u16_m128i(a))));
}

// Converts the two unsigned 16-bit integers in the lower 32 bits to two
// unsigned 64-bit integers.
FORCE_INLINE __m128i _mm_cvtepu16_epi64(__m128i a)
{
    uint16x8_t u16x8 = vreinterpretq_u16_m128i(a);     /* xxxx xxxx xxxx 0B0A */
    uint32x4_t u32x4 = vmovl_u16(vget_low_u16(u16x8)); /* 000x 000x 000B 000A */
    uint64x2_t u64x2 = vmovl_u32(vget_low_u32(u32x4)); /* 0000 000B 0000 000A */
    return vreinterpretq_m128i_u64(u64x2);
}

// Converts the two unsigned 32-bit integers in the lower 64 bits to two
// unsigned 64-bit integers.
FORCE_INLINE __m128i _mm_cvtepu32_epi64(__m128i a)
{
    return vreinterpretq_m128i_u64(
        vmovl_u32(vget_low_u32(vreinterpretq_u32_m128i(a))));
}

// Converts the two signed 32-bit integers in the lower 64 bits to two signed
// 64-bit integers.
FORCE_INLINE __m128i _mm_cvtepi32_epi64(__m128i a)
{
    return vreinterpretq_m128i_s64(
        vmovl_s32(vget_low_s32(vreinterpretq_s32_m128i(a))));
}

// Converts the four single-precision, floating-point values of a to signed
// 32-bit integer values.
//
//   r0 := (int) a0
//   r1 := (int) a1
//   r2 := (int) a2
//   r3 := (int) a3
//
// https://msdn.microsoft.com/en-us/library/vstudio/xdc42k5e(v=vs.100).aspx
// *NOTE*. The default rounding mode on SSE is 'round to even', which ARMv7-A
// does not support! It is supported on ARMv8-A however.
FORCE_INLINE __m128i _mm_cvtps_epi32(__m128 a)
{
#if defined(__aarch64__)
    return vreinterpretq_m128i_s32(vcvtnq_s32_f32(a));
#else
    uint32x4_t signmask = vdupq_n_u32(0x80000000);
    float32x4_t half = vbslq_f32(signmask, vreinterpretq_f32_m128(a),
                                 vdupq_n_f32(0.5f)); /* +/- 0.5 */
    int32x4_t r_normal = vcvtq_s32_f32(vaddq_f32(
        vreinterpretq_f32_m128(a), half)); /* round to integer: [a + 0.5] */
    int32x4_t r_trunc =
        vcvtq_s32_f32(vreinterpretq_f32_m128(a)); /* truncate to integer: [a] */
    int32x4_t plusone = vreinterpretq_s32_u32(vshrq_n_u32(
        vreinterpretq_u32_s32(vnegq_s32(r_trunc)), 31)); /* 1 or 0 */
    int32x4_t r_even = vbicq_s32(vaddq_s32(r_trunc, plusone),
                                 vdupq_n_s32(1)); /* ([a] + {0,1}) & ~1 */
    float32x4_t delta = vsubq_f32(
        vreinterpretq_f32_m128(a),
        vcvtq_f32_s32(r_trunc)); /* compute delta: delta = (a - [a]) */
    uint32x4_t is_delta_half = vceqq_f32(delta, half); /* delta == +/- 0.5 */
    return vreinterpretq_m128i_s32(vbslq_s32(is_delta_half, r_even, r_normal));
#endif
}
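
// Scalar model of the ARMv7 fallback for one lane (a sketch, not the exact
// instruction sequence):
//
//   float half = (x < 0) ? -0.5f : 0.5f;
//   int r_normal = (int) (x + half);              // round half away from zero
//   int r_trunc = (int) x;                        // truncate towards zero
//   int r_even = (r_trunc + (r_trunc > 0)) & ~1;  // nearest even neighbor
//   int r = (x - (float) r_trunc == half) ? r_even : r_normal;
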
// Moves the least significant 32 bits of a to a 32-bit integer.
// https://msdn.microsoft.com/en-us/library/5z7a9642%28v=vs.90%29.aspx
FORCE_INLINE int _mm_cvtsi128_si32(__m128i a)
{
    return vgetq_lane_s32(vreinterpretq_s32_m128i(a), 0);
}

// Extracts the low order 64-bit integer from the parameter.
// https://msdn.microsoft.com/en-us/library/bb531384(v=vs.120).aspx
FORCE_INLINE int64_t _mm_cvtsi128_si64(__m128i a)
{
    return vgetq_lane_s64(vreinterpretq_s64_m128i(a), 0);
}

// Moves 32-bit integer a to the least significant 32 bits of an __m128 object,
// zero extending the upper bits.
//
//   r0 := a
//   r1 := 0x0
//   r2 := 0x0
//   r3 := 0x0
//
// https://msdn.microsoft.com/en-us/library/ct3539ha%28v=vs.90%29.aspx
FORCE_INLINE __m128i _mm_cvtsi32_si128(int a)
{
    return vreinterpretq_m128i_s32(vsetq_lane_s32(a, vdupq_n_s32(0), 0));
}

// Moves 64-bit integer a to the least significant 64 bits of an __m128 object,
// zero extending the upper bits.
//
//   r0 := a
//   r1 := 0x0
FORCE_INLINE __m128i _mm_cvtsi64_si128(int64_t a)
{
    return vreinterpretq_m128i_s64(vsetq_lane_s64(a, vdupq_n_s64(0), 0));
}

// Applies a type cast to reinterpret four 32-bit floating point values passed
// in as a 128-bit parameter as packed 32-bit integers.
// https://msdn.microsoft.com/en-us/library/bb514099.aspx
FORCE_INLINE __m128i _mm_castps_si128(__m128 a)
{
    return vreinterpretq_m128i_s32(vreinterpretq_s32_m128(a));
}

// Applies a type cast to reinterpret four 32-bit integers passed in as a
// 128-bit parameter as packed 32-bit floating point values.
// https://msdn.microsoft.com/en-us/library/bb514029.aspx
FORCE_INLINE __m128 _mm_castsi128_ps(__m128i a)
{
    return vreinterpretq_m128_s32(vreinterpretq_s32_m128i(a));
}

// Loads 128-bit value.
// https://msdn.microsoft.com/en-us/library/atzzad1h(v=vs.80).aspx
FORCE_INLINE __m128i _mm_load_si128(const __m128i *p)
{
    return vreinterpretq_m128i_s32(vld1q_s32((const int32_t *) p));
}

// Loads 128-bit value.
// https://msdn.microsoft.com/zh-cn/library/f4k12ae8(v=vs.90).aspx
FORCE_INLINE __m128i _mm_loadu_si128(const __m128i *p)
{
    return vreinterpretq_m128i_s32(vld1q_s32((const int32_t *) p));
}

// _mm_lddqu_si128 functions the same as _mm_loadu_si128.
#define _mm_lddqu_si128 _mm_loadu_si128

// ******************************************
// Miscellaneous Operations
// ******************************************

// Shifts the 8 signed 16-bit integers in a right by count bits while shifting
// in the sign bit.
//
//   r0 := a0 >> count
//   r1 := a1 >> count
//   ...
//   r7 := a7 >> count
//
// https://msdn.microsoft.com/en-us/library/3c9997dk(v%3dvs.90).aspx
FORCE_INLINE __m128i _mm_sra_epi16(__m128i a, __m128i count)
{
    int64_t c = (int64_t) vget_low_s64((int64x2_t) count);
    if (c > 15)
        return _mm_cmplt_epi16(a, _mm_setzero_si128());
    return vreinterpretq_m128i_s16(vshlq_s16((int16x8_t) a, vdupq_n_s16(-c)));
}

// Shifts the 4 signed 32-bit integers in a right by count bits while shifting
// in the sign bit.
//
//   r0 := a0 >> count
//   r1 := a1 >> count
//   r2 := a2 >> count
//   r3 := a3 >> count
//
// https://msdn.microsoft.com/en-us/library/ce40009e(v%3dvs.100).aspx
FORCE_INLINE __m128i _mm_sra_epi32(__m128i a, __m128i count)
{
    int64_t c = (int64_t) vget_low_s64((int64x2_t) count);
    if (c > 31)
        return _mm_cmplt_epi32(a, _mm_setzero_si128());
    return vreinterpretq_m128i_s32(vshlq_s32((int32x4_t) a, vdupq_n_s32(-c)));
}

// Packs the 16 signed 16-bit integers from a and b into 8-bit integers and
// saturates.
// https://msdn.microsoft.com/en-us/library/k4y4f7w5%28v=vs.90%29.aspx
FORCE_INLINE __m128i _mm_packs_epi16(__m128i a, __m128i b)
{
    return vreinterpretq_m128i_s8(
        vcombine_s8(vqmovn_s16(vreinterpretq_s16_m128i(a)),
                    vqmovn_s16(vreinterpretq_s16_m128i(b))));
}

// Packs the 16 signed 16-bit integers from a and b into 8-bit unsigned
// integers and saturates.
//
//   r0 := UnsignedSaturate(a0)
//   r1 := UnsignedSaturate(a1)
//   ...
//   r7 := UnsignedSaturate(a7)
//   r8 := UnsignedSaturate(b0)
//   r9 := UnsignedSaturate(b1)
//   ...
//   r15 := UnsignedSaturate(b7)
//
// https://msdn.microsoft.com/en-us/library/07ad1wx4(v=vs.100).aspx
FORCE_INLINE __m128i _mm_packus_epi16(const __m128i a, const __m128i b)
{
    return vreinterpretq_m128i_u8(
        vcombine_u8(vqmovun_s16(vreinterpretq_s16_m128i(a)),
                    vqmovun_s16(vreinterpretq_s16_m128i(b))));
}

// Packs the 8 signed 32-bit integers from a and b into signed 16-bit integers
// and saturates.
//
//   r0 := SignedSaturate(a0)
//   r1 := SignedSaturate(a1)
//   r2 := SignedSaturate(a2)
//   r3 := SignedSaturate(a3)
//   r4 := SignedSaturate(b0)
//   r5 := SignedSaturate(b1)
//   r6 := SignedSaturate(b2)
//   r7 := SignedSaturate(b3)
//
// https://msdn.microsoft.com/en-us/library/393t56f9%28v=vs.90%29.aspx
FORCE_INLINE __m128i _mm_packs_epi32(__m128i a, __m128i b)
{
    return vreinterpretq_m128i_s16(
        vcombine_s16(vqmovn_s32(vreinterpretq_s32_m128i(a)),
                     vqmovn_s32(vreinterpretq_s32_m128i(b))));
}

// Packs the 8 signed 32-bit integers from a and b into unsigned 16-bit
// integers and saturates.
//
//   r0 := UnsignedSaturate(a0)
//   r1 := UnsignedSaturate(a1)
//   r2 := UnsignedSaturate(a2)
//   r3 := UnsignedSaturate(a3)
//   r4 := UnsignedSaturate(b0)
//   r5 := UnsignedSaturate(b1)
//   r6 := UnsignedSaturate(b2)
//   r7 := UnsignedSaturate(b3)
FORCE_INLINE __m128i _mm_packus_epi32(__m128i a, __m128i b)
{
    return vreinterpretq_m128i_u16(
        vcombine_u16(vqmovun_s32(vreinterpretq_s32_m128i(a)),
                     vqmovun_s32(vreinterpretq_s32_m128i(b))));
}

// Interleaves the lower 8 signed or unsigned 8-bit integers in a with the lower
// 8 signed or unsigned 8-bit integers in b.
//
//   r0 := a0
//   r1 := b0
//   r2 := a1
//   r3 := b1
//   ...
//   r14 := a7
//   r15 := b7
//
// https://msdn.microsoft.com/en-us/library/xf7k860c%28v=vs.90%29.aspx
FORCE_INLINE __m128i _mm_unpacklo_epi8(__m128i a, __m128i b)
{
#if defined(__aarch64__)
    return vreinterpretq_m128i_s8(
        vzip1q_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
#else
    int8x8_t a1 = vreinterpret_s8_s16(vget_low_s16(vreinterpretq_s16_m128i(a)));
    int8x8_t b1 = vreinterpret_s8_s16(vget_low_s16(vreinterpretq_s16_m128i(b)));
    int8x8x2_t result = vzip_s8(a1, b1);
    return vreinterpretq_m128i_s8(vcombine_s8(result.val[0], result.val[1]));
#endif
}

// Interleaves the lower 4 signed or unsigned 16-bit integers in a with the
// lower 4 signed or unsigned 16-bit integers in b.
//
//   r0 := a0
//   r1 := b0
//   r2 := a1
//   r3 := b1
//   r4 := a2
//   r5 := b2
//   r6 := a3
//   r7 := b3
//
// https://msdn.microsoft.com/en-us/library/btxb17bw%28v=vs.90%29.aspx
FORCE_INLINE __m128i _mm_unpacklo_epi16(__m128i a, __m128i b)
{
#if defined(__aarch64__)
    return vreinterpretq_m128i_s16(
        vzip1q_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
#else
    int16x4_t a1 = vget_low_s16(vreinterpretq_s16_m128i(a));
    int16x4_t b1 = vget_low_s16(vreinterpretq_s16_m128i(b));
    int16x4x2_t result = vzip_s16(a1, b1);
    return vreinterpretq_m128i_s16(vcombine_s16(result.val[0], result.val[1]));
#endif
}

// Interleaves the lower 2 signed or unsigned 32-bit integers in a with the
// lower 2 signed or unsigned 32-bit integers in b.
//
//   r0 := a0
//   r1 := b0
//   r2 := a1
//   r3 := b1
//
// https://msdn.microsoft.com/en-us/library/x8atst9d(v=vs.100).aspx
FORCE_INLINE __m128i _mm_unpacklo_epi32(__m128i a, __m128i b)
{
#if defined(__aarch64__)
    return vreinterpretq_m128i_s32(
        vzip1q_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
#else
    int32x2_t a1 = vget_low_s32(vreinterpretq_s32_m128i(a));
    int32x2_t b1 = vget_low_s32(vreinterpretq_s32_m128i(b));
    int32x2x2_t result = vzip_s32(a1, b1);
    return vreinterpretq_m128i_s32(vcombine_s32(result.val[0], result.val[1]));
#endif
}

// Interleaves the lower signed or unsigned 64-bit integer in a with the
// lower signed or unsigned 64-bit integer in b.
FORCE_INLINE __m128i _mm_unpacklo_epi64(__m128i a, __m128i b)
{
    int64x1_t a_l = vget_low_s64(vreinterpretq_s64_m128i(a));
    int64x1_t b_l = vget_low_s64(vreinterpretq_s64_m128i(b));
    return vreinterpretq_m128i_s64(vcombine_s64(a_l, b_l));
}

// Selects and interleaves the lower two single-precision, floating-point values
// from a and b.
//
//   r0 := a0
//   r1 := b0
//   r2 := a1
//   r3 := b1
//
// https://msdn.microsoft.com/en-us/library/25st103b%28v=vs.90%29.aspx
FORCE_INLINE __m128 _mm_unpacklo_ps(__m128 a, __m128 b)
{
#if defined(__aarch64__)
    return vreinterpretq_m128_f32(
        vzip1q_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
#else
    float32x2_t a1 = vget_low_f32(vreinterpretq_f32_m128(a));
    float32x2_t b1 = vget_low_f32(vreinterpretq_f32_m128(b));
    float32x2x2_t result = vzip_f32(a1, b1);
    return vreinterpretq_m128_f32(vcombine_f32(result.val[0], result.val[1]));
#endif
}

// Selects and interleaves the upper two single-precision, floating-point values
// from a and b.
//
//   r0 := a2
//   r1 := b2
//   r2 := a3
//   r3 := b3
//
// https://msdn.microsoft.com/en-us/library/skccxx7d%28v=vs.90%29.aspx
FORCE_INLINE __m128 _mm_unpackhi_ps(__m128 a, __m128 b)
{
#if defined(__aarch64__)
    return vreinterpretq_m128_f32(
        vzip2q_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
#else
    float32x2_t a1 = vget_high_f32(vreinterpretq_f32_m128(a));
    float32x2_t b1 = vget_high_f32(vreinterpretq_f32_m128(b));
    float32x2x2_t result = vzip_f32(a1, b1);
    return vreinterpretq_m128_f32(vcombine_f32(result.val[0], result.val[1]));
#endif
}

// Interleaves the upper 8 signed or unsigned 8-bit integers in a with the upper
// 8 signed or unsigned 8-bit integers in b.
//
//   r0 := a8
//   r1 := b8
//   r2 := a9
//   r3 := b9
//   ...
//   r14 := a15
//   r15 := b15
//
// https://msdn.microsoft.com/en-us/library/t5h7783k(v=vs.100).aspx
FORCE_INLINE __m128i _mm_unpackhi_epi8(__m128i a, __m128i b)
{
#if defined(__aarch64__)
    return vreinterpretq_m128i_s8(
        vzip2q_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
#else
    int8x8_t a1 =
        vreinterpret_s8_s16(vget_high_s16(vreinterpretq_s16_m128i(a)));
    int8x8_t b1 =
        vreinterpret_s8_s16(vget_high_s16(vreinterpretq_s16_m128i(b)));
    int8x8x2_t result = vzip_s8(a1, b1);
    return vreinterpretq_m128i_s8(vcombine_s8(result.val[0], result.val[1]));
#endif
}

// Interleaves the upper 4 signed or unsigned 16-bit integers in a with the
// upper 4 signed or unsigned 16-bit integers in b.
//
//   r0 := a4
//   r1 := b4
//   r2 := a5
//   r3 := b5
//   r4 := a6
//   r5 := b6
//   r6 := a7
//   r7 := b7
//
// https://msdn.microsoft.com/en-us/library/03196cz7(v=vs.100).aspx
FORCE_INLINE __m128i _mm_unpackhi_epi16(__m128i a, __m128i b)
{
#if defined(__aarch64__)
    return vreinterpretq_m128i_s16(
        vzip2q_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
#else
    int16x4_t a1 = vget_high_s16(vreinterpretq_s16_m128i(a));
    int16x4_t b1 = vget_high_s16(vreinterpretq_s16_m128i(b));
    int16x4x2_t result = vzip_s16(a1, b1);
    return vreinterpretq_m128i_s16(vcombine_s16(result.val[0], result.val[1]));
#endif
}

// Interleaves the upper 2 signed or unsigned 32-bit integers in a with the
// upper 2 signed or unsigned 32-bit integers in b.
// https://msdn.microsoft.com/en-us/library/65sa7cbs(v=vs.100).aspx
FORCE_INLINE __m128i _mm_unpackhi_epi32(__m128i a, __m128i b)
{
#if defined(__aarch64__)
    return vreinterpretq_m128i_s32(
        vzip2q_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
#else
    int32x2_t a1 = vget_high_s32(vreinterpretq_s32_m128i(a));
    int32x2_t b1 = vget_high_s32(vreinterpretq_s32_m128i(b));
    int32x2x2_t result = vzip_s32(a1, b1);
    return vreinterpretq_m128i_s32(vcombine_s32(result.val[0], result.val[1]));
#endif
}

// Interleaves the upper signed or unsigned 64-bit integer in a with the
// upper signed or unsigned 64-bit integer in b.
//
//   r0 := a1
//   r1 := b1
FORCE_INLINE __m128i _mm_unpackhi_epi64(__m128i a, __m128i b)
{
    int64x1_t a_h = vget_high_s64(vreinterpretq_s64_m128i(a));
    int64x1_t b_h = vget_high_s64(vreinterpretq_s64_m128i(b));
    return vreinterpretq_m128i_s64(vcombine_s64(a_h, b_h));
}

// Concatenates the 16-byte values in a and b, shifts the 32-byte composite
// right by c bytes, and returns the low 16 bytes (PALIGNR).
// https://msdn.microsoft.com/en-us/library/bb514041(v=vs.120).aspx
// http://blog.csdn.net/hemmingway/article/details/44828303
// Clang requires a macro here, as it is extremely picky about c being a
// literal.
#define _mm_alignr_epi8(a, b, c) \
    ((__m128i) vextq_s8((int8x16_t) (b), (int8x16_t) (a), (c)))
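
// For example, with byte lanes written high to low (values illustrative):
//
//   a = [a15 .. a0], b = [b15 .. b0]
//   _mm_alignr_epi8(a, b, 4) = [a3 a2 a1 a0 b15 b14 .. b4]
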
3368 // Extracts the selected signed or unsigned 8-bit integer from a and zero
3370 // FORCE_INLINE int _mm_extract_epi8(__m128i a, __constrange(0,16) int imm)
3371 #define _mm_extract_epi8(a, imm) \
3372 vgetq_lane_u8(vreinterpretq_u8_m128i(a), (imm))
3374 // Inserts the least significant 8 bits of b into the selected 8-bit integer
3376 // FORCE_INLINE __m128i _mm_insert_epi8(__m128i a, const int b,
3377 // __constrange(0,16) int imm)
3378 #define _mm_insert_epi8(a, b, imm) \
3380 vreinterpretq_m128i_s8( \
3381 vsetq_lane_s8((b), vreinterpretq_s8_m128i(a), (imm))); \
3384 // Extracts the selected signed or unsigned 16-bit integer from a and zero
3386 // https://msdn.microsoft.com/en-us/library/6dceta0c(v=vs.100).aspx
3387 // FORCE_INLINE int _mm_extract_epi16(__m128i a, __constrange(0,8) int imm)
3388 #define _mm_extract_epi16(a, imm) \
3389 vgetq_lane_u16(vreinterpretq_u16_m128i(a), (imm))
3391 // Inserts the least significant 16 bits of b into the selected 16-bit integer
3393 // https://msdn.microsoft.com/en-us/library/kaze8hz1%28v=vs.100%29.aspx
3394 // FORCE_INLINE __m128i _mm_insert_epi16(__m128i a, const int b,
3395 // __constrange(0,8) int imm)
3396 #define _mm_insert_epi16(a, b, imm) \
3398 vreinterpretq_m128i_s16( \
3399 vsetq_lane_s16((b), vreinterpretq_s16_m128i(a), (imm))); \
3402 // Extracts the selected signed or unsigned 32-bit integer from a and zero
3404 // FORCE_INLINE int _mm_extract_epi32(__m128i a, __constrange(0,4) int imm)
3405 #define _mm_extract_epi32(a, imm) \
3406 vgetq_lane_s32(vreinterpretq_s32_m128i(a), (imm))
3408 // Inserts the least significant 32 bits of b into the selected 32-bit integer
3410 // FORCE_INLINE __m128i _mm_insert_epi32(__m128i a, const int b,
3411 // __constrange(0,4) int imm)
3412 #define _mm_insert_epi32(a, b, imm) \
3414 vreinterpretq_m128i_s32( \
3415 vsetq_lane_s32((b), vreinterpretq_s32_m128i(a), (imm))); \
// Extracts the selected signed or unsigned 64-bit integer from a and zero
// extends.
// FORCE_INLINE __int64 _mm_extract_epi64(__m128i a, __constrange(0,2) int imm)
#define _mm_extract_epi64(a, imm) \
    vgetq_lane_s64(vreinterpretq_s64_m128i(a), (imm))
// Inserts the least significant 64 bits of b into the selected 64-bit integer
// of a.
// FORCE_INLINE __m128i _mm_insert_epi64(__m128i a, const __int64 b,
//                                      __constrange(0,2) int imm)
#define _mm_insert_epi64(a, b, imm)                                  \
    __extension__({                                                  \
        vreinterpretq_m128i_s64(                                     \
            vsetq_lane_s64((b), vreinterpretq_s64_m128i(a), (imm))); \
    })
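
// Illustrative example (added for clarity, not part of the original header):
// reading and replacing a single lane; imm must be a compile-time constant in
// the stated range:
//   int v = _mm_extract_epi16(a, 3);        // read 16-bit lane 3 of a
//   __m128i r = _mm_insert_epi16(a, 42, 3); // a with 16-bit lane 3 set to 42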
// ******************************************
// Crypto Extensions
// ******************************************

#if defined(__ARM_FEATURE_CRYPTO)
// Wraps vmull_p64
FORCE_INLINE uint64x2_t _sse2neon_vmull_p64(uint64x1_t _a, uint64x1_t _b)
{
    poly64_t a = vget_lane_p64(vreinterpret_p64_u64(_a), 0);
    poly64_t b = vget_lane_p64(vreinterpret_p64_u64(_b), 0);
    return vreinterpretq_u64_p128(vmull_p64(a, b));
}
#else  // ARMv7 polyfill
// ARMv7/some A64 lacks vmull_p64, but it has vmull_p8.
//
// vmull_p8 calculates 8 8-bit->16-bit polynomial multiplies, but we need a
// 64-bit->128-bit polynomial multiply.
//
// It needs some work and is somewhat slow, but it is still faster than all
// known scalar methods.
//
// Algorithm adapted to C from
// https://www.workofard.com/2017/07/ghash-for-low-end-cores/, which is adapted
// from "Fast Software Polynomial Multiplication on ARM Processors Using the
// NEON Engine" by Danilo Camara, Conrado Gouvea, Julio Lopez and Ricardo Dahab
// (https://hal.inria.fr/hal-01506572)
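//
// Sketch of the decomposition used below (added for clarity): writing the
// operands as bytes a = a0..a7 and b = b0..b7, the 128-bit carry-less product
// is the XOR of all byte-level partial products a_i * b_j, each shifted left
// by 8*(i+j) bits. vmull_p8 on vext-rotated copies of a and b computes eight
// such partial products at a time; the masking, zipping, and XOR steps below
// align those partial products and accumulate them into the final result.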
static uint64x2_t _sse2neon_vmull_p64(uint64x1_t _a, uint64x1_t _b)
{
    poly8x8_t a = vreinterpret_p8_u64(_a);
    poly8x8_t b = vreinterpret_p8_u64(_b);

    // Masks
    uint8x16_t k48_32 = vcombine_u8(vcreate_u8(0x0000ffffffffffff),
                                    vcreate_u8(0x00000000ffffffff));
    uint8x16_t k16_00 = vcombine_u8(vcreate_u8(0x000000000000ffff),
                                    vcreate_u8(0x0000000000000000));
    // Do the multiplies, rotating with vext to get all combinations
    uint8x16_t d = vreinterpretq_u8_p16(vmull_p8(a, b));                 // D = A0 * B0
    uint8x16_t e = vreinterpretq_u8_p16(vmull_p8(a, vext_p8(b, b, 1)));  // E = A0 * B1
    uint8x16_t f = vreinterpretq_u8_p16(vmull_p8(vext_p8(a, a, 1), b));  // F = A1 * B0
    uint8x16_t g = vreinterpretq_u8_p16(vmull_p8(a, vext_p8(b, b, 2)));  // G = A0 * B2
    uint8x16_t h = vreinterpretq_u8_p16(vmull_p8(vext_p8(a, a, 2), b));  // H = A2 * B0
    uint8x16_t i = vreinterpretq_u8_p16(vmull_p8(a, vext_p8(b, b, 3)));  // I = A0 * B3
    uint8x16_t j = vreinterpretq_u8_p16(vmull_p8(vext_p8(a, a, 3), b));  // J = A3 * B0
    uint8x16_t k = vreinterpretq_u8_p16(vmull_p8(a, vext_p8(b, b, 4)));  // K = A0 * B4
    // Add cross products
    uint8x16_t l = veorq_u8(e, f);  // L = E + F
    uint8x16_t m = veorq_u8(g, h);  // M = G + H
    uint8x16_t n = veorq_u8(i, j);  // N = I + J
    // Interleave. Using vzip1 and vzip2 prevents Clang from emitting TBL
    // instructions.
#if defined(__aarch64__)
    uint8x16_t lm_p0 = vreinterpretq_u8_u64(
        vzip1q_u64(vreinterpretq_u64_u8(l), vreinterpretq_u64_u8(m)));
    uint8x16_t lm_p1 = vreinterpretq_u8_u64(
        vzip2q_u64(vreinterpretq_u64_u8(l), vreinterpretq_u64_u8(m)));
    uint8x16_t nk_p0 = vreinterpretq_u8_u64(
        vzip1q_u64(vreinterpretq_u64_u8(n), vreinterpretq_u64_u8(k)));
    uint8x16_t nk_p1 = vreinterpretq_u8_u64(
        vzip2q_u64(vreinterpretq_u64_u8(n), vreinterpretq_u64_u8(k)));
#else
    uint8x16_t lm_p0 = vcombine_u8(vget_low_u8(l), vget_low_u8(m));
    uint8x16_t lm_p1 = vcombine_u8(vget_high_u8(l), vget_high_u8(m));
    uint8x16_t nk_p0 = vcombine_u8(vget_low_u8(n), vget_low_u8(k));
    uint8x16_t nk_p1 = vcombine_u8(vget_high_u8(n), vget_high_u8(k));
#endif
    // t0 = (L) (P0 + P1) << 8
    // t1 = (M) (P2 + P3) << 16
    uint8x16_t t0t1_tmp = veorq_u8(lm_p0, lm_p1);
    uint8x16_t t0t1_h = vandq_u8(lm_p1, k48_32);
    uint8x16_t t0t1_l = veorq_u8(t0t1_tmp, t0t1_h);

    // t2 = (N) (P4 + P5) << 24
    // t3 = (K) (P6 + P7) << 32
    uint8x16_t t2t3_tmp = veorq_u8(nk_p0, nk_p1);
    uint8x16_t t2t3_h = vandq_u8(nk_p1, k16_00);
    uint8x16_t t2t3_l = veorq_u8(t2t3_tmp, t2t3_h);
    // De-interleave
#if defined(__aarch64__)
    uint8x16_t t0 = vreinterpretq_u8_u64(
        vuzp1q_u64(vreinterpretq_u64_u8(t0t1_l), vreinterpretq_u64_u8(t0t1_h)));
    uint8x16_t t1 = vreinterpretq_u8_u64(
        vuzp2q_u64(vreinterpretq_u64_u8(t0t1_l), vreinterpretq_u64_u8(t0t1_h)));
    uint8x16_t t2 = vreinterpretq_u8_u64(
        vuzp1q_u64(vreinterpretq_u64_u8(t2t3_l), vreinterpretq_u64_u8(t2t3_h)));
    uint8x16_t t3 = vreinterpretq_u8_u64(
        vuzp2q_u64(vreinterpretq_u64_u8(t2t3_l), vreinterpretq_u64_u8(t2t3_h)));
#else
    uint8x16_t t1 = vcombine_u8(vget_high_u8(t0t1_l), vget_high_u8(t0t1_h));
    uint8x16_t t0 = vcombine_u8(vget_low_u8(t0t1_l), vget_low_u8(t0t1_h));
    uint8x16_t t3 = vcombine_u8(vget_high_u8(t2t3_l), vget_high_u8(t2t3_h));
    uint8x16_t t2 = vcombine_u8(vget_low_u8(t2t3_l), vget_low_u8(t2t3_h));
#endif
    // Shift the cross products
    uint8x16_t t0_shift = vextq_u8(t0, t0, 15);  // t0 << 8
    uint8x16_t t1_shift = vextq_u8(t1, t1, 14);  // t1 << 16
    uint8x16_t t2_shift = vextq_u8(t2, t2, 13);  // t2 << 24
    uint8x16_t t3_shift = vextq_u8(t3, t3, 12);  // t3 << 32

    // Accumulate the products
    uint8x16_t cross1 = veorq_u8(t0_shift, t1_shift);
    uint8x16_t cross2 = veorq_u8(t2_shift, t3_shift);
    uint8x16_t mix = veorq_u8(d, cross1);
    uint8x16_t r = veorq_u8(mix, cross2);
    return vreinterpretq_u64_u8(r);
}
#endif  // ARMv7 polyfill

FORCE_INLINE __m128i _mm_clmulepi64_si128(__m128i _a, __m128i _b, const int imm)
{
    uint64x2_t a = vreinterpretq_u64_m128i(_a);
    uint64x2_t b = vreinterpretq_u64_m128i(_b);
    switch (imm & 0x11) {
    case 0x00:  // low halves of both operands
        return vreinterpretq_m128i_u64(
            _sse2neon_vmull_p64(vget_low_u64(a), vget_low_u64(b)));
    case 0x01:  // high half of a, low half of b
        return vreinterpretq_m128i_u64(
            _sse2neon_vmull_p64(vget_high_u64(a), vget_low_u64(b)));
    case 0x10:  // low half of a, high half of b
        return vreinterpretq_m128i_u64(
            _sse2neon_vmull_p64(vget_low_u64(a), vget_high_u64(b)));
    case 0x11:  // high halves of both operands
        return vreinterpretq_m128i_u64(
            _sse2neon_vmull_p64(vget_high_u64(a), vget_high_u64(b)));
    default:
        abort();
    }
}
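
// Illustrative example (added for clarity, not part of the original header):
// GHASH-style use of the carry-less multiply, selecting the high 64-bit half
// of each operand via the immediate:
//   __m128i r = _mm_clmulepi64_si128(a, b, 0x11); // clmul of a[1] and b[1]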
#if !defined(__ARM_FEATURE_CRYPTO) && defined(__aarch64__)
// In the absence of crypto extensions, implement aesenc using regular NEON
// intrinsics instead. See:
// https://www.workofard.com/2017/01/accelerated-aes-for-the-arm64-linux-kernel/
// https://www.workofard.com/2017/07/ghash-for-low-end-cores/ and
// https://github.com/ColinIanKing/linux-next-mirror/blob/b5f466091e130caaf0735976648f72bd5e09aa84/crypto/aegis128-neon-inner.c#L52
// for more information.
// Reproduced with permission of the author.
FORCE_INLINE __m128i _mm_aesenc_si128(__m128i EncBlock, __m128i RoundKey)
{
    static const uint8_t crypto_aes_sbox[256] = {
        0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5, 0x30, 0x01, 0x67, 0x2b,
        0xfe, 0xd7, 0xab, 0x76, 0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0,
        0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0, 0xb7, 0xfd, 0x93, 0x26,
        0x36, 0x3f, 0xf7, 0xcc, 0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15,
        0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a, 0x07, 0x12, 0x80, 0xe2,
        0xeb, 0x27, 0xb2, 0x75, 0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0,
        0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84, 0x53, 0xd1, 0x00, 0xed,
        0x20, 0xfc, 0xb1, 0x5b, 0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf,
        0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85, 0x45, 0xf9, 0x02, 0x7f,
        0x50, 0x3c, 0x9f, 0xa8, 0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5,
        0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2, 0xcd, 0x0c, 0x13, 0xec,
        0x5f, 0x97, 0x44, 0x17, 0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73,
        0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88, 0x46, 0xee, 0xb8, 0x14,
        0xde, 0x5e, 0x0b, 0xdb, 0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c,
        0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79, 0xe7, 0xc8, 0x37, 0x6d,
        0x8d, 0xd5, 0x4e, 0xa9, 0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08,
        0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6, 0xe8, 0xdd, 0x74, 0x1f,
        0x4b, 0xbd, 0x8b, 0x8a, 0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e,
        0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e, 0xe1, 0xf8, 0x98, 0x11,
        0x69, 0xd9, 0x8e, 0x94, 0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf,
        0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68, 0x41, 0x99, 0x2d, 0x0f,
        0xb0, 0x54, 0xbb, 0x16};
    static const uint8_t shift_rows[] = {0x0, 0x5, 0xa, 0xf, 0x4, 0x9,
                                         0xe, 0x3, 0x8, 0xd, 0x2, 0x7,
                                         0xc, 0x1, 0x6, 0xb};
    static const uint8_t ror32by8[] = {0x1, 0x2, 0x3, 0x0, 0x5, 0x6, 0x7, 0x4,
                                       0x9, 0xa, 0xb, 0x8, 0xd, 0xe, 0xf, 0xc};
    uint8x16_t v;
    uint8x16_t w = vreinterpretq_u8_m128i(EncBlock);

    // shift rows
    w = vqtbl1q_u8(w, vld1q_u8(shift_rows));

    // sub bytes
    v = vqtbl4q_u8(vld1q_u8_x4(crypto_aes_sbox), w);
    v = vqtbx4q_u8(v, vld1q_u8_x4(crypto_aes_sbox + 0x40), w - 0x40);
    v = vqtbx4q_u8(v, vld1q_u8_x4(crypto_aes_sbox + 0x80), w - 0x80);
    v = vqtbx4q_u8(v, vld1q_u8_x4(crypto_aes_sbox + 0xc0), w - 0xc0);

    // mix columns
    w = (v << 1) ^ (uint8x16_t)(((int8x16_t) v >> 7) & 0x1b);
    w ^= (uint8x16_t) vrev32q_u16((uint16x8_t) v);
    w ^= vqtbl1q_u8(v ^ w, vld1q_u8(ror32by8));

    // add round key
    return vreinterpretq_m128i_u8(w) ^ RoundKey;
}
#elif defined(__ARM_FEATURE_CRYPTO)
// Implements the equivalent of 'aesenc' by combining AESE (with an empty key)
// and AESMC, then manually applying the real round key as an xor operation.
// This unfortunately means an additional xor op; the compiler should be able
// to optimise this away for repeated calls, however. See
// https://blog.michaelbrase.com/2018/05/08/emulating-x86-aes-intrinsics-on-armv8-a
// for more details.
FORCE_INLINE __m128i _mm_aesenc_si128(__m128i a, __m128i b)
{
    return vreinterpretq_m128i_u8(
        vaesmcq_u8(vaeseq_u8(vreinterpretq_u8_m128i(a), vdupq_n_u8(0))) ^
        vreinterpretq_u8_m128i(b));
}
#endif
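
// Note (added for clarity): AESE performs AddRoundKey, SubBytes and ShiftRows
// in one instruction, so feeding it an all-zero key and xoring the real key
// after AESMC reproduces the x86 round order (ShiftRows, SubBytes, MixColumns,
// AddRoundKey):
//   __m128i state = _mm_aesenc_si128(state, round_key); // one AES round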
// ******************************************
// Streaming Extensions
// ******************************************

// Guarantees that every preceding store is globally visible before any
// subsequent store.
// https://msdn.microsoft.com/en-us/library/5h2w73d1%28v=vs.90%29.aspx
FORCE_INLINE void _mm_sfence(void)
{
    __sync_synchronize();
}
// Stores the data in a to the address p without polluting the caches. If the
// cache line containing address p is already in the cache, the cache will be
// updated. Address p must be 16-byte aligned.
// https://msdn.microsoft.com/en-us/library/ba08y07y%28v=vs.90%29.aspx
FORCE_INLINE void _mm_stream_si128(__m128i *p, __m128i a)
{
    vst1q_s64((int64_t *) p, vreinterpretq_s64_m128i(a));
}
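
// Illustrative example (added for clarity, not part of the original header):
// pairing the streaming store with a fence before handing the buffer to
// another observer:
//   _mm_stream_si128((__m128i *) dst, value);
//   _mm_sfence();  // make the store globally visible
// On NEON this is a plain store, so the non-temporal hint is advisory only.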
// Cache line containing p is flushed and invalidated from all caches in the
// coherency domain.
// https://msdn.microsoft.com/en-us/library/ba08y07y(v=vs.100).aspx
FORCE_INLINE void _mm_clflush(void const *p)
{
    (void) p;
    // no corollary for NEON; this is intentionally a no-op
}
// Allocate aligned blocks of memory.
// https://software.intel.com/en-us/
// cpp-compiler-developer-guide-and-reference-allocating-and-freeing-aligned-memory-blocks
FORCE_INLINE void *_mm_malloc(size_t size, size_t align)
{
    void *ptr;
    // posix_memalign requires align to be a multiple of sizeof(void *)
    if (align == 1)
        return malloc(size);
    if (align == 2 || (sizeof(void *) == 8 && align == 4))
        align = sizeof(void *);
    if (!posix_memalign(&ptr, align, size))
        return ptr;
    return NULL;
}
FORCE_INLINE void _mm_free(void *addr)
{
    free(addr);
}
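
// Illustrative example (added for clarity, not part of the original header):
// allocating storage aligned for __m128i accesses and releasing it with the
// matching free, per the x86 API contract:
//   __m128i *buf = (__m128i *) _mm_malloc(64 * sizeof(__m128i), 16);
//   /* ... use buf ... */
//   _mm_free(buf);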
#if defined(__GNUC__) || defined(__clang__)
#pragma pop_macro("ALIGN_STRUCT")
#pragma pop_macro("FORCE_INLINE")
#endif