63#ifndef SSE2NEON_PRECISE_MINMAX
64#define SSE2NEON_PRECISE_MINMAX (0)
67#ifndef SSE2NEON_PRECISE_DIV
68#define SSE2NEON_PRECISE_DIV (0)
71#ifndef SSE2NEON_PRECISE_SQRT
72#define SSE2NEON_PRECISE_SQRT (0)
75#ifndef SSE2NEON_PRECISE_DP
76#define SSE2NEON_PRECISE_DP (0)
82#ifndef SSE2NEON_INCLUDE_WINDOWS_H
83#define SSE2NEON_INCLUDE_WINDOWS_H (0)
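/* Illustrative usage sketch (example values only): the tunables above default
 * to 0 and are meant to be overridden by the including translation unit
 * before this header is pulled in. */
#if 0
#define SSE2NEON_PRECISE_MINMAX 1 /* x86-faithful NaN handling in min/max */
#define SSE2NEON_PRECISE_DIV 1    /* extra Newton-Raphson step for division */
#include "sse2neon.h"
#endif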
87#if defined(__GNUC__) || defined(__clang__)
88#pragma push_macro("FORCE_INLINE")
89#pragma push_macro("ALIGN_STRUCT")
90#define FORCE_INLINE static inline __attribute__((always_inline))
91#define ALIGN_STRUCT(x) __attribute__((aligned(x)))
92#define _sse2neon_likely(x) __builtin_expect(!!(x), 1)
93#define _sse2neon_unlikely(x) __builtin_expect(!!(x), 0)
94#elif defined(_MSC_VER)
96#error Using the traditional MSVC preprocessor is not supported! Use /Zc:preprocessor instead.
99#define FORCE_INLINE static inline
102#define ALIGN_STRUCT(x) __declspec(align(x))
104#define _sse2neon_likely(x) (x)
105#define _sse2neon_unlikely(x) (x)
107#pragma message("Macro name collisions may happen with unsupported compilers.")
110#if !defined(__clang__) && defined(__GNUC__) && __GNUC__ < 10
111#warning "GCC versions earlier than 10 are not supported."
114#if defined(__OPTIMIZE__) && !defined(SSE2NEON_SUPPRESS_WARNINGS)
116 "Report any potential compiler optimization issues when using SSE2NEON. See the 'Optimization' section at https://github.com/DLTcollab/sse2neon."
121#define _sse2neon_const static const
123#define _sse2neon_const const
134 memcpy(&tmp, &val, sizeof(uint64_t));
140 memcpy(&tmp, &val, sizeof(uint64_t));
144#if defined(_WIN32) && !defined(__MINGW32__)
146#define SSE2NEON_ALLOC_DEFINED
151#if defined(_M_ARM64EC)
152#define _DISABLE_SOFTINTRIN_ 1
155#if SSE2NEON_INCLUDE_WINDOWS_H
156#include <processthreadsapi.h>
160#if !defined(__cplusplus)
161#error SSE2NEON only supports C++ compilation with this compiler
164#ifdef SSE2NEON_ALLOC_DEFINED
168#if (defined(_M_AMD64) || defined(__x86_64__)) || \
169 (defined(_M_ARM64) || defined(_M_ARM64EC) || defined(__arm64__))
170#define SSE2NEON_HAS_BITSCAN64
174#if defined(__GNUC__) || defined(__clang__)
175#define _sse2neon_define0(type, s, body) \
180#define _sse2neon_define1(type, s, body) \
185#define _sse2neon_define2(type, a, b, body) \
187 type _a = (a), _b = (b); \
190#define _sse2neon_return(ret) (ret)
192#define _sse2neon_define0(type, a, body) [=](type _a) { body }(a)
193#define _sse2neon_define1(type, a, body) [](type _a) { body }(a)
194#define _sse2neon_define2(type, a, b, body) \
195 [](type _a, type _b) { body }((a), (b))
196#define _sse2neon_return(ret) return ret
199#define _sse2neon_init(...) \
205#if defined(_MSC_VER) && !defined(__clang__)
206#define SSE2NEON_BARRIER() _ReadWriteBarrier()
208#define SSE2NEON_BARRIER() \
210 __asm__ __volatile__("" ::: "memory"); \
220#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L)
221#include <stdatomic.h>
227#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L) && \
228 !defined(__STDC_NO_ATOMICS__)
229 atomic_thread_fence(memory_order_seq_cst);
230#elif defined(__GNUC__) || defined(__clang__)
231 __atomic_thread_fence(__ATOMIC_SEQ_CST);
233 __dmb(_ARM64_BARRIER_ISH);
240#if defined(__arm__) && __ARM_ARCH == 7
245#if !defined(__ARM_NEON) || !defined(__ARM_NEON__)
246#error "You must enable NEON instructions (e.g. -mfpu=neon) to use SSE2NEON."
248#if !defined(__clang__)
249#pragma GCC push_options
250#pragma GCC target("fpu=neon")
252#elif defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
253#if !defined(__clang__) && !defined(_MSC_VER)
254#pragma GCC push_options
255#pragma GCC target("+simd")
258#if !defined(__ARM_NEON) || !defined(__ARM_NEON__)
260 "You must enable NEON instructions (e.g. -mfpu=neon-fp-armv8) to use SSE2NEON."
262#if !defined(__clang__) && !defined(_MSC_VER)
263#pragma GCC push_options
267 "Unsupported target. Must be either ARMv7-A+NEON or ARMv8-A \
268(you could try setting target explicitly with -march or -mcpu)"
273#if (!defined(__aarch64__) && !defined(_M_ARM64) && !defined(_M_ARM64EC)) && \
275#if defined __has_include && __has_include(<arm_acle.h>)
285#if defined(__APPLE__) && (defined(__aarch64__) || defined(__arm64__))
286#define SSE2NEON_CACHELINE_SIZE 128
288#define SSE2NEON_CACHELINE_SIZE 64
292#if !defined(__aarch64__) && !defined(_M_ARM64) && !defined(_M_ARM64EC)
301#if !defined(__aarch64__) && !defined(_M_ARM64) && !defined(_M_ARM64EC)
310#if defined(__GNUC__) && (__GNUC__ <= 9)
311#define __has_builtin(x) HAS##x
312#define HAS__builtin_popcount 1
313#define HAS__builtin_popcountll 1
316#if (__GNUC__ >= 5) || ((__GNUC__ == 4) && (__GNUC_MINOR__ >= 7))
317#define HAS__builtin_shuffle 1
319#define HAS__builtin_shuffle 0
322#define HAS__builtin_shufflevector 0
323#define HAS__builtin_nontemporal_store 0
325#define __has_builtin(x) 0
337#define _MM_SHUFFLE(fp3, fp2, fp1, fp0) \
338 (((fp3) << 6) | ((fp2) << 4) | ((fp1) << 2) | ((fp0)))
347#define _MM_SHUFFLE2(fp1, fp0) (((fp1) << 1) | (fp0))
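/* Illustrative only: the shuffle immediate packs four 2-bit lane selectors,
 * most significant selector first (fp3 picks the top lane of the result). */
#if 0
int identity = _MM_SHUFFLE(3, 2, 1, 0); /* == 0xE4: every lane keeps its index */
int splat0 = _MM_SHUFFLE(0, 0, 0, 0);   /* == 0x00: broadcast lane 0 */
#endif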
349#if __has_builtin(__builtin_shufflevector)
350#define _sse2neon_shuffle(type, a, b, ...) \
351 __builtin_shufflevector(a, b, __VA_ARGS__)
352#elif __has_builtin(__builtin_shuffle)
353#define _sse2neon_shuffle(type, a, b, ...) \
355 type tmp = {__VA_ARGS__}; \
356 __builtin_shuffle(a, b, tmp); \
360#ifdef _sse2neon_shuffle
361#define vshuffle_s16(a, b, ...) _sse2neon_shuffle(int16x4_t, a, b, __VA_ARGS__)
362#define vshuffleq_s16(a, b, ...) _sse2neon_shuffle(int16x8_t, a, b, __VA_ARGS__)
363#define vshuffle_s32(a, b, ...) _sse2neon_shuffle(int32x2_t, a, b, __VA_ARGS__)
364#define vshuffleq_s32(a, b, ...) _sse2neon_shuffle(int32x4_t, a, b, __VA_ARGS__)
365#define vshuffle_s64(a, b, ...) _sse2neon_shuffle(int64x1_t, a, b, __VA_ARGS__)
366#define vshuffleq_s64(a, b, ...) _sse2neon_shuffle(int64x2_t, a, b, __VA_ARGS__)
370#define _MM_FROUND_TO_NEAREST_INT 0x00
371#define _MM_FROUND_TO_NEG_INF 0x01
372#define _MM_FROUND_TO_POS_INF 0x02
373#define _MM_FROUND_TO_ZERO 0x03
374#define _MM_FROUND_CUR_DIRECTION 0x04
375#define _MM_FROUND_NO_EXC 0x08
376#define _MM_FROUND_RAISE_EXC 0x00
377#define _MM_FROUND_NINT (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_RAISE_EXC)
378#define _MM_FROUND_FLOOR (_MM_FROUND_TO_NEG_INF | _MM_FROUND_RAISE_EXC)
379#define _MM_FROUND_CEIL (_MM_FROUND_TO_POS_INF | _MM_FROUND_RAISE_EXC)
380#define _MM_FROUND_TRUNC (_MM_FROUND_TO_ZERO | _MM_FROUND_RAISE_EXC)
381#define _MM_FROUND_RINT (_MM_FROUND_CUR_DIRECTION | _MM_FROUND_RAISE_EXC)
382#define _MM_FROUND_NEARBYINT (_MM_FROUND_CUR_DIRECTION | _MM_FROUND_NO_EXC)
383#define _MM_ROUND_NEAREST 0x0000
384#define _MM_ROUND_DOWN 0x2000
385#define _MM_ROUND_UP 0x4000
386#define _MM_ROUND_TOWARD_ZERO 0x6000
388#define _MM_FLUSH_ZERO_MASK 0x8000
389#define _MM_FLUSH_ZERO_ON 0x8000
390#define _MM_FLUSH_ZERO_OFF 0x0000
392#define _MM_DENORMALS_ZERO_MASK 0x0040
393#define _MM_DENORMALS_ZERO_ON 0x0040
394#define _MM_DENORMALS_ZERO_OFF 0x0000
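/* Illustrative only: these constants mirror the x86 MXCSR fields; a typical
 * pattern is to select truncation and flush-to-zero before a hot loop using
 * the mode setters provided further below. */
#if 0
_MM_SET_ROUNDING_MODE(_MM_ROUND_TOWARD_ZERO);
_MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON);
#endif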
397#define __constrange(a, b) const
410#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
424#if !(defined(_WIN32) || defined(_WIN64) || defined(__int64))
425#if (defined(__x86_64__) || defined(__i386__))
426#define __int64 long long
428#define __int64 int64_t
434#define vreinterpretq_m128_f16(x) vreinterpretq_f32_f16(x)
435#define vreinterpretq_m128_f32(x) (x)
436#define vreinterpretq_m128_f64(x) vreinterpretq_f32_f64(x)
438#define vreinterpretq_m128_u8(x) vreinterpretq_f32_u8(x)
439#define vreinterpretq_m128_u16(x) vreinterpretq_f32_u16(x)
440#define vreinterpretq_m128_u32(x) vreinterpretq_f32_u32(x)
441#define vreinterpretq_m128_u64(x) vreinterpretq_f32_u64(x)
443#define vreinterpretq_m128_s8(x) vreinterpretq_f32_s8(x)
444#define vreinterpretq_m128_s16(x) vreinterpretq_f32_s16(x)
445#define vreinterpretq_m128_s32(x) vreinterpretq_f32_s32(x)
446#define vreinterpretq_m128_s64(x) vreinterpretq_f32_s64(x)
448#define vreinterpretq_f16_m128(x) vreinterpretq_f16_f32(x)
449#define vreinterpretq_f32_m128(x) (x)
450#define vreinterpretq_f64_m128(x) vreinterpretq_f64_f32(x)
452#define vreinterpretq_u8_m128(x) vreinterpretq_u8_f32(x)
453#define vreinterpretq_u16_m128(x) vreinterpretq_u16_f32(x)
454#define vreinterpretq_u32_m128(x) vreinterpretq_u32_f32(x)
455#define vreinterpretq_u64_m128(x) vreinterpretq_u64_f32(x)
457#define vreinterpretq_s8_m128(x) vreinterpretq_s8_f32(x)
458#define vreinterpretq_s16_m128(x) vreinterpretq_s16_f32(x)
459#define vreinterpretq_s32_m128(x) vreinterpretq_s32_f32(x)
460#define vreinterpretq_s64_m128(x) vreinterpretq_s64_f32(x)
462#define vreinterpretq_m128i_s8(x) vreinterpretq_s64_s8(x)
463#define vreinterpretq_m128i_s16(x) vreinterpretq_s64_s16(x)
464#define vreinterpretq_m128i_s32(x) vreinterpretq_s64_s32(x)
465#define vreinterpretq_m128i_s64(x) (x)
467#define vreinterpretq_m128i_u8(x) vreinterpretq_s64_u8(x)
468#define vreinterpretq_m128i_u16(x) vreinterpretq_s64_u16(x)
469#define vreinterpretq_m128i_u32(x) vreinterpretq_s64_u32(x)
470#define vreinterpretq_m128i_u64(x) vreinterpretq_s64_u64(x)
472#define vreinterpretq_f32_m128i(x) vreinterpretq_f32_s64(x)
473#define vreinterpretq_f64_m128i(x) vreinterpretq_f64_s64(x)
475#define vreinterpretq_s8_m128i(x) vreinterpretq_s8_s64(x)
476#define vreinterpretq_s16_m128i(x) vreinterpretq_s16_s64(x)
477#define vreinterpretq_s32_m128i(x) vreinterpretq_s32_s64(x)
478#define vreinterpretq_s64_m128i(x) (x)
480#define vreinterpretq_u8_m128i(x) vreinterpretq_u8_s64(x)
481#define vreinterpretq_u16_m128i(x) vreinterpretq_u16_s64(x)
482#define vreinterpretq_u32_m128i(x) vreinterpretq_u32_s64(x)
483#define vreinterpretq_u64_m128i(x) vreinterpretq_u64_s64(x)
485#define vreinterpret_m64_s8(x) vreinterpret_s64_s8(x)
486#define vreinterpret_m64_s16(x) vreinterpret_s64_s16(x)
487#define vreinterpret_m64_s32(x) vreinterpret_s64_s32(x)
488#define vreinterpret_m64_s64(x) (x)
490#define vreinterpret_m64_u8(x) vreinterpret_s64_u8(x)
491#define vreinterpret_m64_u16(x) vreinterpret_s64_u16(x)
492#define vreinterpret_m64_u32(x) vreinterpret_s64_u32(x)
493#define vreinterpret_m64_u64(x) vreinterpret_s64_u64(x)
495#define vreinterpret_m64_f16(x) vreinterpret_s64_f16(x)
496#define vreinterpret_m64_f32(x) vreinterpret_s64_f32(x)
497#define vreinterpret_m64_f64(x) vreinterpret_s64_f64(x)
499#define vreinterpret_u8_m64(x) vreinterpret_u8_s64(x)
500#define vreinterpret_u16_m64(x) vreinterpret_u16_s64(x)
501#define vreinterpret_u32_m64(x) vreinterpret_u32_s64(x)
502#define vreinterpret_u64_m64(x) vreinterpret_u64_s64(x)
504#define vreinterpret_s8_m64(x) vreinterpret_s8_s64(x)
505#define vreinterpret_s16_m64(x) vreinterpret_s16_s64(x)
506#define vreinterpret_s32_m64(x) vreinterpret_s32_s64(x)
507#define vreinterpret_s64_m64(x) (x)
509#define vreinterpret_f32_m64(x) vreinterpret_f32_s64(x)
511#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
512#define vreinterpretq_m128d_s32(x) vreinterpretq_f64_s32(x)
513#define vreinterpretq_m128d_s64(x) vreinterpretq_f64_s64(x)
515#define vreinterpretq_m128d_u64(x) vreinterpretq_f64_u64(x)
517#define vreinterpretq_m128d_f32(x) vreinterpretq_f64_f32(x)
518#define vreinterpretq_m128d_f64(x) (x)
520#define vreinterpretq_s64_m128d(x) vreinterpretq_s64_f64(x)
522#define vreinterpretq_u32_m128d(x) vreinterpretq_u32_f64(x)
523#define vreinterpretq_u64_m128d(x) vreinterpretq_u64_f64(x)
525#define vreinterpretq_f64_m128d(x) (x)
526#define vreinterpretq_f32_m128d(x) vreinterpretq_f32_f64(x)
528#define vreinterpretq_m128d_s32(x) vreinterpretq_f32_s32(x)
529#define vreinterpretq_m128d_s64(x) vreinterpretq_f32_s64(x)
531#define vreinterpretq_m128d_u32(x) vreinterpretq_f32_u32(x)
532#define vreinterpretq_m128d_u64(x) vreinterpretq_f32_u64(x)
534#define vreinterpretq_m128d_f32(x) (x)
536#define vreinterpretq_s64_m128d(x) vreinterpretq_s64_f32(x)
538#define vreinterpretq_u32_m128d(x) vreinterpretq_u32_f32(x)
539#define vreinterpretq_u64_m128d(x) vreinterpretq_u64_f32(x)
541#define vreinterpretq_f32_m128d(x) (x)
574 uint16_t m128_u16[8];
575 uint32_t m128_u32[4];
576 uint64_t m128_u64[2];
580#define vreinterpretq_nth_u64_m128i(x, n) (((SIMDVec *) &x)->m128_u64[n])
581#define vreinterpretq_nth_u32_m128i(x, n) (((SIMDVec *) &x)->m128_u32[n])
582#define vreinterpretq_nth_u8_m128i(x, n) (((SIMDVec *) &x)->m128_u8[n])
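/* Illustrative only: the SIMDVec punning macros above give scalar access to
 * individual lanes of an __m128i without going through a store. */
#if 0
__m128i v = _mm_set1_epi32(7);
uint32_t lane2 = vreinterpretq_nth_u32_m128i(v, 2); /* == 7 */
#endif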
585#define _MM_GET_FLUSH_ZERO_MODE _sse2neon_mm_get_flush_zero_mode
586#define _MM_SET_FLUSH_ZERO_MODE _sse2neon_mm_set_flush_zero_mode
587#define _MM_GET_DENORMALS_ZERO_MODE _sse2neon_mm_get_denormals_zero_mode
588#define _MM_SET_DENORMALS_ZERO_MODE _sse2neon_mm_set_denormals_zero_mode
622#if defined(__GNUC__) && !defined(__clang__) && \
623 ((__GNUC__ <= 13 && defined(__arm__)) || \
624 (__GNUC__ == 10 && __GNUC_MINOR__ < 3 && defined(__aarch64__)) || \
625 (__GNUC__ <= 9 && defined(__aarch64__)))
629 ret.val[0] = vld1q_u8(p + 0);
630 ret.val[1] = vld1q_u8(p + 16);
631 ret.val[2] = vld1q_u8(p + 32);
632 ret.val[3] = vld1q_u8(p + 48);
639 return vld1q_u8_x4(p);
643#if !defined(__aarch64__) && !defined(_M_ARM64) && !defined(_M_ARM64EC)
647 const uint64x1_t v1 = vpaddl_u32(vpaddl_u16(vpaddl_u8(v8)));
648 return vget_lane_u8(vreinterpret_u8_u64(v1), 0);
658#if !defined(__aarch64__) && !defined(_M_ARM64) && !defined(_M_ARM64EC)
662 uint8x8_t tmp = vpadd_u8(vget_low_u8(a), vget_high_u8(a));
664 for (int i = 0; i < 8; ++i)
676#if !defined(__aarch64__) && !defined(_M_ARM64) && !defined(_M_ARM64EC)
680 uint32x4_t m = vpaddlq_u16(a);
681 uint64x2_t n = vpaddlq_u32(m);
682 uint64x1_t o = vget_low_u64(n) + vget_high_u64(n);
684 return vget_lane_u32((uint32x2_t) o, 0);
690 return vaddvq_u16(a);
731#if defined(_M_ARM64EC)
753#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
779 float32x2_t a21 = vget_high_f32(
781 float32x2_t b03 = vget_low_f32(
788 float32x2_t a03 = vget_low_f32(
790 float32x2_t b21 = vget_high_f32(
853 float32x2_t a02 = vset_lane_f32(a0, a22, 1);
871 float32x2_t b20 = vset_lane_f32(b2, b00, 1);
878 float32_t b2 = vgetq_lane_f32(b, 2);
880 float32x2_t b20 = vset_lane_f32(b2, b00, 1);
887 float32_t b2 = vgetq_lane_f32(b, 2);
889 float32x2_t b20 = vset_lane_f32(b2, b00, 1);
897#if ((defined(_M_ARM64) || defined(_M_ARM64EC)) && !defined(__clang__)) || \
898 (defined(__ARM_FEATURE_CRYPTO) && \
899 (defined(__aarch64__) || __has_builtin(__builtin_arm_crypto_vmullp64)))
901FORCE_INLINE uint64x2_t _sse2neon_vmull_p64(uint64x1_t _a, uint64x1_t _b)
903 poly64_t a = vget_lane_p64(vreinterpret_p64_u64(_a), 0);
904 poly64_t b = vget_lane_p64(vreinterpret_p64_u64(_b), 0);
905#if defined(_MSC_VER) && !defined(__clang__)
906 __n64 a1 = {a}, b1 = {b};
907 return vreinterpretq_u64_p128(vmull_p64(a1, b1));
909 return vreinterpretq_u64_p128(vmull_p64(a, b));
926static uint64x2_t _sse2neon_vmull_p64(uint64x1_t _a, uint64x1_t _b)
928 poly8x8_t a = vreinterpret_p8_u64(_a);
929 poly8x8_t b = vreinterpret_p8_u64(_b);
932 uint8x16_t k48_32 = vcombine_u8(vcreate_u8(0x0000ffffffffffff),
933 vcreate_u8(0x00000000ffffffff));
934 uint8x16_t k16_00 = vcombine_u8(vcreate_u8(0x000000000000ffff),
935 vcreate_u8(0x0000000000000000));
938 uint8x16_t d = vreinterpretq_u8_p16(vmull_p8(a, b));
940 vreinterpretq_u8_p16(vmull_p8(a, vext_p8(b, b, 1)));
942 vreinterpretq_u8_p16(vmull_p8(vext_p8(a, a, 1), b));
944 vreinterpretq_u8_p16(vmull_p8(a, vext_p8(b, b, 2)));
946 vreinterpretq_u8_p16(vmull_p8(vext_p8(a, a, 2), b));
948 vreinterpretq_u8_p16(vmull_p8(a, vext_p8(b, b, 3)));
950 vreinterpretq_u8_p16(vmull_p8(vext_p8(a, a, 3), b));
952 vreinterpretq_u8_p16(vmull_p8(a, vext_p8(b, b, 4)));
955 uint8x16_t l = veorq_u8(e, f);
956 uint8x16_t m = veorq_u8(g, h);
957 uint8x16_t n = veorq_u8(i, j);
961#if defined(__aarch64__)
962 uint8x16_t lm_p0 = vreinterpretq_u8_u64(
963 vzip1q_u64(vreinterpretq_u64_u8(l), vreinterpretq_u64_u8(m)));
964 uint8x16_t lm_p1 = vreinterpretq_u8_u64(
965 vzip2q_u64(vreinterpretq_u64_u8(l), vreinterpretq_u64_u8(m)));
966 uint8x16_t nk_p0 = vreinterpretq_u8_u64(
967 vzip1q_u64(vreinterpretq_u64_u8(n), vreinterpretq_u64_u8(k)));
968 uint8x16_t nk_p1 = vreinterpretq_u8_u64(
969 vzip2q_u64(vreinterpretq_u64_u8(n), vreinterpretq_u64_u8(k)));
971 uint8x16_t lm_p0 = vcombine_u8(vget_low_u8(l), vget_low_u8(m));
972 uint8x16_t lm_p1 = vcombine_u8(vget_high_u8(l), vget_high_u8(m));
973 uint8x16_t nk_p0 = vcombine_u8(vget_low_u8(n), vget_low_u8(k));
974 uint8x16_t nk_p1 = vcombine_u8(vget_high_u8(n), vget_high_u8(k));
978 uint8x16_t t0t1_tmp = veorq_u8(lm_p0, lm_p1);
979 uint8x16_t t0t1_h = vandq_u8(lm_p1, k48_32);
980 uint8x16_t t0t1_l = veorq_u8(t0t1_tmp, t0t1_h);
984 uint8x16_t t2t3_tmp = veorq_u8(nk_p0, nk_p1);
985 uint8x16_t t2t3_h = vandq_u8(nk_p1, k16_00);
986 uint8x16_t t2t3_l = veorq_u8(t2t3_tmp, t2t3_h);
989#if defined(__aarch64__)
990 uint8x16_t t0 = vreinterpretq_u8_u64(
991 vuzp1q_u64(vreinterpretq_u64_u8(t0t1_l), vreinterpretq_u64_u8(t0t1_h)));
992 uint8x16_t t1 = vreinterpretq_u8_u64(
993 vuzp2q_u64(vreinterpretq_u64_u8(t0t1_l), vreinterpretq_u64_u8(t0t1_h)));
994 uint8x16_t t2 = vreinterpretq_u8_u64(
995 vuzp1q_u64(vreinterpretq_u64_u8(t2t3_l), vreinterpretq_u64_u8(t2t3_h)));
996 uint8x16_t t3 = vreinterpretq_u8_u64(
997 vuzp2q_u64(vreinterpretq_u64_u8(t2t3_l), vreinterpretq_u64_u8(t2t3_h)));
999 uint8x16_t t1 = vcombine_u8(vget_high_u8(t0t1_l), vget_high_u8(t0t1_h));
1000 uint8x16_t t0 = vcombine_u8(vget_low_u8(t0t1_l), vget_low_u8(t0t1_h));
1001 uint8x16_t t3 = vcombine_u8(vget_high_u8(t2t3_l), vget_high_u8(t2t3_h));
1002 uint8x16_t t2 = vcombine_u8(vget_low_u8(t2t3_l), vget_low_u8(t2t3_h));
1005 uint8x16_t t0_shift = vextq_u8(t0, t0, 15);
1006 uint8x16_t t1_shift = vextq_u8(t1, t1, 14);
1007 uint8x16_t t2_shift = vextq_u8(t2, t2, 13);
1008 uint8x16_t t3_shift = vextq_u8(t3, t3, 12);
1011 uint8x16_t cross1 = veorq_u8(t0_shift, t1_shift);
1012 uint8x16_t cross2 = veorq_u8(t2_shift, t3_shift);
1013 uint8x16_t mix = veorq_u8(d, cross1);
1014 uint8x16_t r = veorq_u8(mix, cross2);
1015 return vreinterpretq_u64_u8(r);
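/* Illustrative only: _sse2neon_vmull_p64 is the 64x64 -> 128-bit carry-less
 * multiply primitive; _mm_clmulepi64_si128 later selects which 64-bit halves
 * of its operands feed it. */
#if 0
uint64x2_t prod = _sse2neon_vmull_p64(vcreate_u64(0x3), vcreate_u64(0x5));
/* 0b11 clmul 0b101 == 0b1111, i.e. low lane 0xF, high lane 0 */
#endif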
1027#define _mm_shuffle_epi32_default(a, imm) \
1028 vreinterpretq_m128i_s32(vsetq_lane_s32( \
1029 vgetq_lane_s32(vreinterpretq_s32_m128i(a), ((imm) >> 6) & 0x3), \
1031 vgetq_lane_s32(vreinterpretq_s32_m128i(a), ((imm) >> 4) & 0x3), \
1032 vsetq_lane_s32(vgetq_lane_s32(vreinterpretq_s32_m128i(a), \
1033 ((imm) >> 2) & 0x3), \
1034 vmovq_n_s32(vgetq_lane_s32( \
1035 vreinterpretq_s32_m128i(a), (imm) & (0x3))), \
1122#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
1123#define _mm_shuffle_epi32_splat(a, imm) \
1124 vreinterpretq_m128i_s32(vdupq_laneq_s32(vreinterpretq_s32_m128i(a), (imm)))
1126#define _mm_shuffle_epi32_splat(a, imm) \
1127 vreinterpretq_m128i_s32( \
1128 vdupq_n_s32(vgetq_lane_s32(vreinterpretq_s32_m128i(a), (imm))))
1145#define _mm_shuffle_ps_default(a, b, imm) \
1146 vreinterpretq_m128_f32(vsetq_lane_f32( \
1147 vgetq_lane_f32(vreinterpretq_f32_m128(b), ((imm) >> 6) & 0x3), \
1149 vgetq_lane_f32(vreinterpretq_f32_m128(b), ((imm) >> 4) & 0x3), \
1151 vgetq_lane_f32(vreinterpretq_f32_m128(a), ((imm) >> 2) & 0x3), \
1153 vgetq_lane_f32(vreinterpretq_f32_m128(a), (imm) & (0x3))), \
1162#define _mm_shufflelo_epi16_function(a, imm) \
1163 _sse2neon_define1( \
1164 __m128i, a, int16x8_t ret = vreinterpretq_s16_m128i(_a); \
1165 int16x4_t lowBits = vget_low_s16(ret); \
1166 ret = vsetq_lane_s16(vget_lane_s16(lowBits, (imm) & (0x3)), ret, 0); \
1167 ret = vsetq_lane_s16(vget_lane_s16(lowBits, ((imm) >> 2) & 0x3), ret, \
1169 ret = vsetq_lane_s16(vget_lane_s16(lowBits, ((imm) >> 4) & 0x3), ret, \
1171 ret = vsetq_lane_s16(vget_lane_s16(lowBits, ((imm) >> 6) & 0x3), ret, \
1173 _sse2neon_return(vreinterpretq_m128i_s16(ret));)
1179#define _mm_shufflehi_epi16_function(a, imm) \
1180 _sse2neon_define1( \
1181 __m128i, a, int16x8_t ret = vreinterpretq_s16_m128i(_a); \
1182 int16x4_t highBits = vget_high_s16(ret); \
1183 ret = vsetq_lane_s16(vget_lane_s16(highBits, (imm) & (0x3)), ret, 4); \
1184 ret = vsetq_lane_s16(vget_lane_s16(highBits, ((imm) >> 2) & 0x3), ret, \
1186 ret = vsetq_lane_s16(vget_lane_s16(highBits, ((imm) >> 4) & 0x3), ret, \
1188 ret = vsetq_lane_s16(vget_lane_s16(highBits, ((imm) >> 6) & 0x3), ret, \
1190 _sse2neon_return(vreinterpretq_m128i_s16(ret));)
1215 float32x4_t value = vsetq_lane_f32(b0, vdupq_n_f32(0), 0);
1494 return vgetq_lane_u32(a_eq_b, 0) & 0x1;
1504 return vgetq_lane_u32(a_ge_b, 0) & 0x1;
1514 return vgetq_lane_u32(a_gt_b, 0) & 0x1;
1524 return vgetq_lane_u32(a_le_b, 0) & 0x1;
1534 return vgetq_lane_u32(a_lt_b, 0) & 0x1;
1562#if (defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)) || \
1563 defined(__ARM_FEATURE_DIRECTED_ROUNDING)
1587#if (defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)) || \
1588 defined(__ARM_FEATURE_DIRECTED_ROUNDING)
1592 float32_t data = vgetq_lane_f32(
1594 return (int32_t) data;
1653#define _mm_cvtps_pi32(a) _mm_cvt_ps2pi(a)
1689#define _mm_cvtsi32_ss(a, b) _mm_cvt_si2ss(a, b)
1711#define _mm_cvtss_si32(a) _mm_cvt_ss2si(a)
1718#if (defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)) || \
1719 defined(__ARM_FEATURE_DIRECTED_ROUNDING)
1722 float32_t data = vgetq_lane_f32(
1724 return (int64_t) data;
1748#define _mm_cvttps_pi32(a) _mm_cvtt_ps2pi(a)
1753#define _mm_cvttss_si32(a) _mm_cvtt_ss2si(a)
1771#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
1801#define _mm_extract_pi16(a, imm) \
1802 (int32_t) vget_lane_u16(vreinterpret_u16_m64(a), (imm))
1806#if !defined(SSE2NEON_ALLOC_DEFINED)
1810 _aligned_free(addr);
1820#if defined(_MSC_VER) && !defined(__clang__)
1821 value = _ReadStatusReg(ARM64_FPCR);
1823 __asm__ __volatile__("mrs %0, FPCR" : "=r"(value));
1830#if defined(_MSC_VER) && !defined(__clang__)
1831 _WriteStatusReg(ARM64_FPCR, value);
1833 __asm__ __volatile__("msr FPCR, %0" :: "r"(value));
1845#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
1852#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
1855 __asm__ __volatile__("vmrs %0, FPSCR" : "=r"(r.value));
1867 switch (fegetround()) {
1887#define _mm_insert_pi16(a, b, imm) \
1888 vreinterpret_m64_s16(vset_lane_s16((b), vreinterpret_s16_m64(a), (imm)))
1908#define _mm_load_ps1 _mm_load1_ps
1934 vcombine_f32(vget_low_f32(a), vld1_f32((const float32_t *) p)));
1944 vcombine_f32(vld1_f32((const float32_t *) p), vget_high_f32(a)));
1953 float32x4_t v = vrev64q_f32(vld1q_f32(p));
1973 vsetq_lane_s16(*(const unaligned_int16_t *) p, vdupq_n_s16(0), 0));
1981 vsetq_lane_s64(*(const unaligned_int64_t *) p, vdupq_n_s64(0), 0));
1988#if !defined(SSE2NEON_ALLOC_DEFINED)
1992 return _aligned_malloc(size, align);
1996 return malloc(size);
1997 if (align == 2 || (sizeof(void *) == 8 && align == 4))
1998 align = sizeof(void *);
1999 if (!posix_memalign(&ptr, align, size))
2017 vst1_s8((int8_t *) mem_addr, masked);
2024#define _m_maskmovq(a, mask, mem_addr) _mm_maskmove_si64(a, mask, mem_addr)
2042#if SSE2NEON_PRECISE_MINMAX
2069 float32_t value = vgetq_lane_f32(_mm_max_ps(a, b), 0);
2090#if SSE2NEON_PRECISE_MINMAX
2117 float32_t value = vgetq_lane_f32(_mm_min_ps(a, b), 0);
2139#if defined(__aarch64__)
2166#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
2167 static const int8_t shift[8] = {0, 1, 2, 3, 4, 5, 6, 7};
2168 uint8x8_t tmp = vshr_n_u8(input, 7);
2169 return vaddv_u8(vshl_u8(tmp, vld1_s8(shift)));
2172 uint16x4_t high_bits = vreinterpret_u16_u8(vshr_n_u8(input, 7));
2173 uint32x2_t paired16 =
2174 vreinterpret_u32_u16(vsra_n_u16(high_bits, high_bits, 7));
2175 uint8x8_t paired32 =
2176 vreinterpret_u8_u32(vsra_n_u32(paired16, paired16, 14));
2177 return vget_lane_u8(paired32, 0) | ((int) vget_lane_u8(paired32, 4) << 4);
2187#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
2188 static const int32_t shift[4] = {0, 1, 2, 3};
2189 uint32x4_t tmp = vshrq_n_u32(input, 31);
2190 return vaddvq_u32(vshlq_u32(tmp, vld1q_s32(shift)));
2195 uint64x2_t high_bits = vreinterpretq_u64_u32(vshrq_n_u32(input, 31));
2198 vreinterpretq_u8_u64(vsraq_n_u64(high_bits, high_bits, 31));
2200 return vgetq_lane_u8(paired, 0) | (vgetq_lane_u8(paired, 8) << 2);
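/* Illustrative only: _mm_movemask_ps packs the four sign bits into the low
 * nibble of the result, lane 0 in bit 0. */
#if 0
__m128 v = _mm_set_ps(-1.0f, 1.0f, -2.0f, 2.0f); /* lanes 3..0 */
int mask = _mm_movemask_ps(v);                   /* == 0b1010 */
#endif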
2244#define _m_pavgb(a, b) _mm_avg_pu8(a, b)
2249#define _m_pavgw(a, b) _mm_avg_pu16(a, b)
2254#define _m_pextrw(a, imm) _mm_extract_pi16(a, imm)
2259#define _m_pinsrw(a, i, imm) _mm_insert_pi16(a, i, imm)
2264#define _m_pmaxsw(a, b) _mm_max_pi16(a, b)
2269#define _m_pmaxub(a, b) _mm_max_pu8(a, b)
2274#define _m_pminsw(a, b) _mm_min_pi16(a, b)
2279#define _m_pminub(a, b) _mm_min_pu8(a, b)
2284#define _m_pmovmskb(a) _mm_movemask_pi8(a)
2290#define _m_pmulhuw(a, b) _mm_mulhi_pu16(a, b)
2298#if defined(_MSC_VER) && !defined(__clang__)
2316 __builtin_prefetch(p, 0, 0);
2319 __builtin_prefetch(p, 0, 3);
2322 __builtin_prefetch(p, 0, 2);
2325 __builtin_prefetch(p, 0, 1);
2336#define _m_psadbw(a, b) _mm_sad_pu8(a, b)
2341#define _m_pshufw(a, imm) _mm_shuffle_pi16(a, imm)
2351#if SSE2NEON_PRECISE_DIV
2378 const uint32x4_t pos_inf = vdupq_n_u32(0x7F800000);
2379 const uint32x4_t neg_inf = vdupq_n_u32(0xFF800000);
2380 const uint32x4_t has_pos_zero =
2381 vceqq_u32(pos_inf, vreinterpretq_u32_f32(out));
2382 const uint32x4_t has_neg_zero =
2383 vceqq_u32(neg_inf, vreinterpretq_u32_f32(out));
2387#if SSE2NEON_PRECISE_SQRT
2395 out = vbslq_f32(has_pos_zero, (float32x4_t) pos_inf, out);
2396 out = vbslq_f32(has_neg_zero, (float32x4_t) neg_inf, out);
2408 return vsetq_lane_f32(vgetq_lane_f32(_mm_rsqrt_ps(in), 0), in, 0);
2418 uint64x1_t t = vpaddl_u32(vpaddl_u16(
2421 vset_lane_u16((uint16_t) vget_lane_u64(t, 0), vdup_n_u16(0), 0));
2434#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
2441#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
2444 __asm__ __volatile__("vmrs %0, FPSCR" : "=r"(r.value));
2449#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
2452 __asm__ __volatile__("vmsr FPSCR, %0" :: "r"(r));
2482 rounding = FE_TONEAREST;
2485 rounding = FE_DOWNWARD;
2488 rounding = FE_UPWARD;
2491 rounding = FE_TOWARDZERO;
2497 rounding = FE_TOWARDZERO;
2499 fesetround(rounding);
2554#ifdef _sse2neon_shuffle
2555#define _mm_shuffle_pi16(a, imm) \
2556 vreinterpret_m64_s16(vshuffle_s16( \
2557 vreinterpret_s16_m64(a), vreinterpret_s16_m64(a), ((imm) & 0x3), \
2558 (((imm) >> 2) & 0x3), (((imm) >> 4) & 0x3), (((imm) >> 6) & 0x3)))
2560#define _mm_shuffle_pi16(a, imm) \
2561 _sse2neon_define1( \
2562 __m64, a, int16x4_t ret; \
2564 vget_lane_s16(vreinterpret_s16_m64(_a), (imm) & (0x3))); \
2565 ret = vset_lane_s16( \
2566 vget_lane_s16(vreinterpret_s16_m64(_a), ((imm) >> 2) & 0x3), ret, \
2568 ret = vset_lane_s16( \
2569 vget_lane_s16(vreinterpret_s16_m64(_a), ((imm) >> 4) & 0x3), ret, \
2571 ret = vset_lane_s16( \
2572 vget_lane_s16(vreinterpret_s16_m64(_a), ((imm) >> 6) & 0x3), ret, \
2574 _sse2neon_return(vreinterpret_m64_s16(ret));)
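/* Illustrative only: _mm_shuffle_pi16 reorders the four 16-bit lanes of an
 * __m64 using the same immediate encoding as _MM_SHUFFLE. */
#if 0
__m64 v = _mm_set_pi16(3, 2, 1, 0);                     /* lanes 3..0 */
__m64 r = _mm_shuffle_pi16(v, _MM_SHUFFLE(0, 1, 2, 3)); /* lane order reversed */
#endif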
2610#ifdef _sse2neon_shuffle
2611#define _mm_shuffle_ps(a, b, imm) \
2613 float32x4_t _input1 = vreinterpretq_f32_m128(a); \
2614 float32x4_t _input2 = vreinterpretq_f32_m128(b); \
2615 float32x4_t _shuf = \
2616 vshuffleq_s32(_input1, _input2, (imm) & (0x3), ((imm) >> 2) & 0x3, \
2617 (((imm) >> 4) & 0x3) + 4, (((imm) >> 6) & 0x3) + 4); \
2618 vreinterpretq_m128_f32(_shuf); \
2621#define _mm_shuffle_ps(a, b, imm) \
2622 _sse2neon_define2( \
2623 __m128, a, b, __m128 ret; switch (imm) { \
2624 case _MM_SHUFFLE(1, 0, 3, 2): \
2625 ret = _mm_shuffle_ps_1032(_a, _b); \
2627 case _MM_SHUFFLE(2, 3, 0, 1): \
2628 ret = _mm_shuffle_ps_2301(_a, _b); \
2630 case _MM_SHUFFLE(0, 3, 2, 1): \
2631 ret = _mm_shuffle_ps_0321(_a, _b); \
2633 case _MM_SHUFFLE(2, 1, 0, 3): \
2634 ret = _mm_shuffle_ps_2103(_a, _b); \
2636 case _MM_SHUFFLE(1, 0, 1, 0): \
2637 ret = _mm_movelh_ps(_a, _b); \
2639 case _MM_SHUFFLE(1, 0, 0, 1): \
2640 ret = _mm_shuffle_ps_1001(_a, _b); \
2642 case _MM_SHUFFLE(0, 1, 0, 1): \
2643 ret = _mm_shuffle_ps_0101(_a, _b); \
2645 case _MM_SHUFFLE(3, 2, 1, 0): \
2646 ret = _mm_shuffle_ps_3210(_a, _b); \
2648 case _MM_SHUFFLE(0, 0, 1, 1): \
2649 ret = _mm_shuffle_ps_0011(_a, _b); \
2651 case _MM_SHUFFLE(0, 0, 2, 2): \
2652 ret = _mm_shuffle_ps_0022(_a, _b); \
2654 case _MM_SHUFFLE(2, 2, 0, 0): \
2655 ret = _mm_shuffle_ps_2200(_a, _b); \
2657 case _MM_SHUFFLE(3, 2, 0, 2): \
2658 ret = _mm_shuffle_ps_3202(_a, _b); \
2660 case _MM_SHUFFLE(3, 2, 3, 2): \
2661 ret = _mm_movehl_ps(_b, _a); \
2663 case _MM_SHUFFLE(1, 1, 3, 3): \
2664 ret = _mm_shuffle_ps_1133(_a, _b); \
2666 case _MM_SHUFFLE(2, 0, 1, 0): \
2667 ret = _mm_shuffle_ps_2010(_a, _b); \
2669 case _MM_SHUFFLE(2, 0, 0, 1): \
2670 ret = _mm_shuffle_ps_2001(_a, _b); \
2672 case _MM_SHUFFLE(2, 0, 3, 2): \
2673 ret = _mm_shuffle_ps_2032(_a, _b); \
2676 ret = _mm_shuffle_ps_default(_a, _b, (imm)); \
2678 } _sse2neon_return(ret);)
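/* Illustrative only: with _MM_SHUFFLE(w, z, y, x) the two low result lanes are
 * taken from a (selectors x, y) and the two high lanes from b (z, w). */
#if 0
__m128 a = _mm_set_ps(3.f, 2.f, 1.f, 0.f);
__m128 b = _mm_set_ps(7.f, 6.f, 5.f, 4.f);
__m128 r = _mm_shuffle_ps(a, b, _MM_SHUFFLE(3, 2, 1, 0)); /* lanes 0..3: 0, 1, 6, 7 */
#endif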
2689#if (defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)) && \
2690 !SSE2NEON_PRECISE_SQRT
2697 const uint32x4_t pos_inf = vdupq_n_u32(0x7F800000);
2698 const uint32x4_t div_by_zero =
2699 vceqq_u32(pos_inf, vreinterpretq_u32_f32(recip));
2700 recip = vreinterpretq_f32_u32(
2701 vandq_u32(vmvnq_u32(div_by_zero), vreinterpretq_u32_f32(recip)));
2744 vst1q_f32(p, vdupq_n_f32(a0));
2759#define _mm_store1_ps _mm_store_ps1
2784 float32x4_t rev = vextq_f32(tmp, tmp, 2);
2824#if __has_builtin(__builtin_nontemporal_store)
2825 __builtin_nontemporal_store(a, (float32x4_t *) p);
2855#define _MM_TRANSPOSE4_PS(row0, row1, row2, row3) \
2857 float32x4x2_t ROW01 = vtrnq_f32(row0, row1); \
2858 float32x4x2_t ROW23 = vtrnq_f32(row2, row3); \
2859 row0 = vcombine_f32(vget_low_f32(ROW01.val[0]), \
2860 vget_low_f32(ROW23.val[0])); \
2861 row1 = vcombine_f32(vget_low_f32(ROW01.val[1]), \
2862 vget_low_f32(ROW23.val[1])); \
2863 row2 = vcombine_f32(vget_high_f32(ROW01.val[0]), \
2864 vget_high_f32(ROW23.val[0])); \
2865 row3 = vcombine_f32(vget_high_f32(ROW01.val[1]), \
2866 vget_high_f32(ROW23.val[1])); \
2871#define _mm_ucomieq_ss _mm_comieq_ss
2872#define _mm_ucomige_ss _mm_comige_ss
2873#define _mm_ucomigt_ss _mm_comigt_ss
2874#define _mm_ucomile_ss _mm_comile_ss
2875#define _mm_ucomilt_ss _mm_comilt_ss
2876#define _mm_ucomineq_ss _mm_comineq_ss
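/* Illustrative only: the _MM_TRANSPOSE4_PS macro above transposes a 4x4
 * matrix held in four row registers in place. */
#if 0
__m128 r0 = _mm_set_ps(3.f, 2.f, 1.f, 0.f);
__m128 r1 = _mm_set_ps(7.f, 6.f, 5.f, 4.f);
__m128 r2 = _mm_set_ps(11.f, 10.f, 9.f, 8.f);
__m128 r3 = _mm_set_ps(15.f, 14.f, 13.f, 12.f);
_MM_TRANSPOSE4_PS(r0, r1, r2, r3); /* r0 now holds the old column 0: 0, 4, 8, 12 */
#endif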
2882#if defined(__GNUC__) || defined(__clang__)
2883#pragma GCC diagnostic push
2884#pragma GCC diagnostic ignored "-Wuninitialized"
2887#if defined(_MSC_VER)
2891#if defined(__GNUC__) || defined(__clang__)
2892#pragma GCC diagnostic pop
2900#if defined(__GNUC__) || defined(__clang__)
2901#pragma GCC diagnostic push
2902#pragma GCC diagnostic ignored "-Wuninitialized"
2905#if defined(_MSC_VER)
2909#if defined(__GNUC__) || defined(__clang__)
2910#pragma GCC diagnostic pop
2919#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
2925 float32x2x2_t result = vzip_f32(a1, b1);
2935#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
2941 float32x2x2_t result = vzip_f32(a1, b1);
2994#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
2995 return vreinterpretq_m128d_f64(
2996 vaddq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
3009 return vld1q_f32((float32_t *) c);
3019#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
3029 return vld1q_f32((float32_t *) c);
3136#define _mm_bslli_si128(a, imm) _mm_slli_si128(a, imm)
3141#define _mm_bsrli_si128(a, imm) _mm_srli_si128(a, imm)
3180#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
3198#if defined(__APPLE__)
3199#include <libkern/OSCacheControl.h>
3209#if defined(__APPLE__)
3211#elif defined(__GNUC__) || defined(__clang__)
3212 uintptr_t ptr = (uintptr_t) p;
3213 __builtin___clear_cache((char *) ptr,
3215#elif (_MSC_VER) && SSE2NEON_INCLUDE_WINDOWS_H
3252#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
3254 vceqq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
3259 uint32x4_t swapped = vrev64q_u32(cmp);
3278#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
3280 vcgeq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
3291 d[0] = a0 >= b0 ? ~UINT64_C(0) : UINT64_C(0);
3292 d[1] = a1 >= b1 ? ~UINT64_C(0) : UINT64_C(0);
3304#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
3313 d[0] = a0 >= b0 ? ~UINT64_C(0) : UINT64_C(0);
3352#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
3354 vcgtq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
3365 d[0] = a0 > b0 ? ~UINT64_C(0) : UINT64_C(0);
3366 d[1] = a1 > b1 ? ~UINT64_C(0) : UINT64_C(0);
3378#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
3387 d[0] = a0 > b0 ? ~UINT64_C(0) : UINT64_C(0);
3399#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
3401 vcleq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
3412 d[0] = a0 <= b0 ? ~UINT64_C(0) : UINT64_C(0);
3413 d[1] = a1 <= b1 ? ~UINT64_C(0) : UINT64_C(0);
3425#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
3434 d[0] = a0 <= b0 ? ~UINT64_C(0) : UINT64_C(0);
3476#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
3478 vcltq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
3489 d[0] = a0 < b0 ? ~UINT64_C(0) : UINT64_C(0);
3490 d[1] = a1 < b1 ? ~UINT64_C(0) : UINT64_C(0);
3502#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
3510 d[0] = a0 < b0 ? ~UINT64_C(0) : UINT64_C(0);
3522#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
3524 vceqq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)))));
3529 uint32x4_t swapped = vrev64q_u32(cmp);
3548#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
3550 vcgeq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)),
3551 vdupq_n_u64(UINT64_MAX)));
3562 d[0] = !(a0 >= b0) ? ~UINT64_C(0) : UINT64_C(0);
3563 d[1] = !(a1 >= b1) ? ~UINT64_C(0) : UINT64_C(0);
3583#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
3585 vcgtq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)),
3586 vdupq_n_u64(UINT64_MAX)));
3597 d[0] = !(a0 > b0) ? ~UINT64_C(0) : UINT64_C(0);
3598 d[1] = !(a1 > b1) ? ~UINT64_C(0) : UINT64_C(0);
3618#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
3620 vcleq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)),
3621 vdupq_n_u64(UINT64_MAX)));
3632 d[0] = !(a0 <= b0) ? ~UINT64_C(0) : UINT64_C(0);
3633 d[1] = !(a1 <= b1) ? ~UINT64_C(0) : UINT64_C(0);
3653#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
3655 vcltq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)),
3656 vdupq_n_u64(UINT64_MAX)));
3667 d[0] = !(a0 < b0) ? ~UINT64_C(0) : UINT64_C(0);
3668 d[1] = !(a1 < b1) ? ~UINT64_C(0) : UINT64_C(0);
3688#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
3690 uint64x2_t not_nan_a =
3691 vceqq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(a));
3692 uint64x2_t not_nan_b =
3693 vceqq_f64(vreinterpretq_f64_m128d(b), vreinterpretq_f64_m128d(b));
3705 d[0] = (a0 == a0 && b0 == b0) ? ~UINT64_C(0) : UINT64_C(0);
3706 d[1] = (a1 == a1 && b1 == b1) ? ~UINT64_C(0) : UINT64_C(0);
3718#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
3726 d[0] = (a0 == a0 && b0 == b0) ? ~UINT64_C(0) : UINT64_C(0);
3738#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
3740 uint64x2_t not_nan_a =
3741 vceqq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(a));
3742 uint64x2_t not_nan_b =
3743 vceqq_f64(vreinterpretq_f64_m128d(b), vreinterpretq_f64_m128d(b));
3745 vmvnq_s32(vreinterpretq_s32_u64(vandq_u64(not_nan_a, not_nan_b))));
3756 d[0] = (a0 == a0 && b0 == b0) ? UINT64_C(0) : ~UINT64_C(0);
3757 d[1] = (a1 == a1 && b1 == b1) ? UINT64_C(0) : ~UINT64_C(0);
3769#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
3777 d[0] = (a0 == a0 && b0 == b0) ? UINT64_C(0) : ~UINT64_C(0);
3789#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
3790 return vgetq_lane_u64(vcgeq_f64(a, b), 0) & 0x1;
3804#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
3805 return vgetq_lane_u64(vcgtq_f64(a, b), 0) & 0x1;
3820#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
3821 return vgetq_lane_u64(vcleq_f64(a, b), 0) & 0x1;
3836#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
3837 return vgetq_lane_u64(vcltq_f64(a, b), 0) & 0x1;
3852#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
3853 return vgetq_lane_u64(vceqq_f64(a, b), 0) & 0x1;
3855 uint32x4_t a_not_nan =
3857 uint32x4_t b_not_nan =
3859 uint32x4_t a_and_b_not_nan = vandq_u32(a_not_nan, b_not_nan);
3862 uint64x2_t and_results = vandq_u64(vreinterpretq_u64_u32(a_and_b_not_nan),
3863 vreinterpretq_u64_u32(a_eq_b));
3864 return vgetq_lane_u64(and_results, 0) & 0x1;
3881#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
3882 return vreinterpretq_m128d_f64(
3905#if defined(__ARM_FEATURE_FRINT) && !defined(__clang__)
3906 float64x2_t rounded = vrnd32xq_f64(vreinterpretq_f64_m128d(a));
3907 int64x2_t integers = vcvtq_s64_f64(rounded);
3909 vcombine_s32(vmovn_s64(integers), vdup_n_s32(0)));
3932 int32_t ALIGN_STRUCT(16) data[2] = {(int32_t) d0, (int32_t) d1};
3942#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
3943 float32x2_t tmp = vcvt_f32_f64(vreinterpretq_f64_m128d(a));
3949 return _mm_set_ps(0, 0, (float) a1, (float) a0);
3958#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
3959 return vreinterpretq_m128d_f64(
3975#if defined(__ARM_FEATURE_FRINT)
3977#elif (defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)) || \
3978 defined(__ARM_FEATURE_DIRECTED_ROUNDING)
3990 float *f = (float *) &a;
3993 uint32x4_t signmask = vdupq_n_u32(0x80000000);
3996 int32x4_t r_normal = vcvtq_s32_f32(vaddq_f32(
3998 int32x4_t r_trunc = vcvtq_s32_f32(
4000 int32x4_t plusone = vreinterpretq_s32_u32(vshrq_n_u32(
4001 vreinterpretq_u32_s32(vnegq_s32(r_trunc)), 31));
4002 int32x4_t r_even = vbicq_s32(vaddq_s32(r_trunc, plusone),
4004 float32x4_t delta = vsubq_f32(
4006 vcvtq_f32_s32(r_trunc));
4007 uint32x4_t is_delta_half =
4008 vceqq_f32(delta, half);
4010 vbslq_s32(is_delta_half, r_even, r_normal));
4013 return _mm_set_epi32(floorf(f[3]), floorf(f[2]), floorf(f[1]),
4019 return _mm_set_epi32((int32_t) f[3], (int32_t) f[2], (int32_t) f[1],
4031#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
4032 return vreinterpretq_m128d_f64(
4045#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
4046 return (double) vgetq_lane_f64(vreinterpretq_f64_m128d(a), 0);
4059#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
4060 return (int32_t) vgetq_lane_f64(vrndiq_f64(vreinterpretq_f64_m128d(a)), 0);
4065 return (int32_t) ret;
4074#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
4075 return (int64_t) vgetq_lane_f64(vrndiq_f64(vreinterpretq_f64_m128d(a)), 0);
4080 return (int64_t) ret;
4087#define _mm_cvtsd_si64x _mm_cvtsd_si64
4096#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
4098 vget_lane_f32(vcvt_f32_f64(vreinterpretq_f64_m128d(b)), 0),
4124#define _mm_cvtsi128_si64x(a) _mm_cvtsi128_si64(a)
4132#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
4133 return vreinterpretq_m128d_f64(
4134 vsetq_lane_f64((double) b, vreinterpretq_f64_m128d(a), 0));
4144#define _mm_cvtsi128_si64x(a) _mm_cvtsi128_si64(a)
4160#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
4161 return vreinterpretq_m128d_f64(
4162 vsetq_lane_f64((double) b, vreinterpretq_f64_m128d(a), 0));
4181#define _mm_cvtsi64x_si128(a) _mm_cvtsi64_si128(a)
4187#define _mm_cvtsi64x_sd(a, b) _mm_cvtsi64_sd(a, b)
4197#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
4198 return vreinterpretq_m128d_f64(
4199 vsetq_lane_f64(d, vreinterpretq_f64_m128d(a), 0));
4225 int32_t ALIGN_STRUCT(16) data[2] = {(int32_t) a0, (int32_t) a1};
4244 return (int32_t) _a;
4252#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
4253 return vgetq_lane_s64(vcvtq_s64_f64(vreinterpretq_f64_m128d(a)), 0);
4257 return (int64_t) _a;
4264#define _mm_cvttsd_si64x(a) _mm_cvttsd_si64(a)
4271#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
4272 return vreinterpretq_m128d_f64(
4273 vdivq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
4286 return vld1q_f32((float32_t *) c);
4297#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
4299 vdivq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b));
4300 return vreinterpretq_m128d_f64(
4301 vsetq_lane_f64(vgetq_lane_f64(vreinterpretq_f64_m128d(a), 1), tmp, 1));
4311#define _mm_extract_epi16(a, imm) \
4312 vgetq_lane_u16(vreinterpretq_u16_m128i(a), (imm))
4319#define _mm_insert_epi16(a, b, imm) \
4320 vreinterpretq_m128i_s16( \
4321 vsetq_lane_s16((b), vreinterpretq_s16_m128i(a), (imm)))
4329#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
4330 return vreinterpretq_m128d_f64(vld1q_f64(p));
4332 const float *fp = (const float *) p;
4333 float ALIGN_STRUCT(16) data[4] = {fp[0], fp[1], fp[2], fp[3]};
4341#define _mm_load_pd1 _mm_load1_pd
4349#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
4350 return vreinterpretq_m128d_f64(vsetq_lane_f64(*p, vdupq_n_f64(0), 0));
4352 const float *fp = (const float *) p;
4371#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
4372 return vreinterpretq_m128d_f64(vld1q_dup_f64(p));
4384#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
4385 return vreinterpretq_m128d_f64(
4386 vcombine_f64(vget_low_f64(vreinterpretq_f64_m128d(a)), vld1_f64(p)));
4401 vcombine_s32(vld1_s32((int32_t const *) p), vcreate_s32(0)));
4410#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
4411 return vreinterpretq_m128d_f64(
4412 vcombine_f64(vld1_f64(p), vget_high_f64(vreinterpretq_f64_m128d(a))));
4415 vcombine_f32(vld1_f32((const float *) p),
4426#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
4427 float64x2_t v = vld1q_f64(p);
4428 return vreinterpretq_m128d_f64(vextq_f64(v, v, 1));
4430 int64x2_t v = vld1q_s64((const int64_t *) p);
4455 vsetq_lane_s32(*(const unaligned_int32_t *) p, vdupq_n_s32(0), 0));
4466#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
4475 int32x2_t low_sum = vpadd_s32(vget_low_s32(low), vget_high_s32(low));
4476 int32x2_t high_sum = vpadd_s32(vget_low_s32(high), vget_high_s32(high));
4494 vst1q_s8((int8_t *) mem_addr, masked);
4520#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
4521#if SSE2NEON_PRECISE_MINMAX
4522 float64x2_t _a = vreinterpretq_f64_m128d(a);
4523 float64x2_t _b = vreinterpretq_f64_m128d(b);
4524 return vreinterpretq_m128d_f64(vbslq_f64(vcgtq_f64(_a, _b), _a, _b));
4526 return vreinterpretq_m128d_f64(
4527 vmaxq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
4552#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
4559 double c[2] = {a0 > b0 ? a0 : b0, a1};
4587#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
4588#if SSE2NEON_PRECISE_MINMAX
4589 float64x2_t _a = vreinterpretq_f64_m128d(a);
4590 float64x2_t _b = vreinterpretq_f64_m128d(b);
4591 return vreinterpretq_m128d_f64(vbslq_f64(vcltq_f64(_a, _b), _a, _b));
4593 return vreinterpretq_m128d_f64(
4594 vminq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
4618#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
4625 double c[2] = {a0 < b0 ? a0 : b0, a1};
4679 uint16x8_t high_bits = vreinterpretq_u16_u8(vshrq_n_u8(input, 7));
4694 uint32x4_t paired16 =
4695 vreinterpretq_u32_u16(vsraq_n_u16(high_bits, high_bits, 7));
4708 uint64x2_t paired32 =
4709 vreinterpretq_u64_u32(vsraq_n_u32(paired16, paired16, 14));
4722 uint8x16_t paired64 =
4723 vreinterpretq_u8_u64(vsraq_n_u64(paired32, paired32, 28));
4730 return vgetq_lane_u8(paired64, 0) | ((int) vgetq_lane_u8(paired64, 8) << 8);
4739 uint64x2_t high_bits = vshrq_n_u64(input, 63);
4740 return (int) (vgetq_lane_u64(high_bits, 0) |
4741 (vgetq_lane_u64(high_bits, 1) << 1));
4776#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
4777 return vreinterpretq_m128d_f64(
4778 vmulq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
4791 return vld1q_f32((float32_t *) c);
4825 int32x4_t ab3210 = vmull_s16(a3210, b3210);
4828 int32x4_t ab7654 = vmull_s16(a7654, b7654);
4830 vuzpq_u16(vreinterpretq_u16_s32(ab3210), vreinterpretq_u16_s32(ab7654));
4842 uint32x4_t ab3210 = vmull_u16(a3210, b3210);
4843#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
4846 uint16x8_t r = vuzp2q_u16(vreinterpretq_u16_u32(ab3210),
4847 vreinterpretq_u16_u32(ab7654));
4852 uint32x4_t ab7654 = vmull_u16(a7654, b7654);
4854 vuzpq_u16(vreinterpretq_u16_u32(ab3210), vreinterpretq_u16_u32(ab7654));
4924#if defined(_MSC_VER) && !defined(__clang__)
4925 __isb(_ARM64_BARRIER_SY);
4927 __asm__ __volatile__("isb\n");
4938 uint16x8_t t = vpaddlq_u8(vabdq_u8((uint8x16_t) a, (uint8x16_t) b));
4953 int16_t ALIGN_STRUCT(16) data[8] = {i0, i1, i2, i3, i4, i5, i6, i7};
4969 return _mm_set_epi64x(vget_lane_s64(i1, 0), vget_lane_s64(i2, 0));
4977 vcombine_s64(vcreate_s64(i2), vcreate_s64(i1)));
5000 (int8_t) b0, (int8_t) b1, (int8_t) b2, (int8_t) b3,
5001 (int8_t) b4, (int8_t) b5, (int8_t) b6, (int8_t) b7,
5002 (int8_t) b8, (int8_t) b9, (int8_t) b10, (int8_t) b11,
5003 (int8_t) b12, (int8_t) b13, (int8_t) b14, (int8_t) b15};
5004 return (__m128i) vld1q_s8(data);
5013#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
5014 return vreinterpretq_m128d_f64(vld1q_f64((float64_t *) data));
5023#define _mm_set_pd1 _mm_set1_pd
5030#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
5031 return vreinterpretq_m128d_f64(vsetq_lane_f64(a, vdupq_n_f64(0), 0));
5077#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
5078 return vreinterpretq_m128d_f64(vdupq_n_f64(d));
5096 int16_t ALIGN_STRUCT(16) data[8] = {w0, w1, w2, w3, w4, w5, w6, w7};
5135 (int8_t) b0, (int8_t) b1, (int8_t) b2, (int8_t) b3,
5136 (int8_t) b4, (int8_t) b5, (int8_t) b6, (int8_t) b7,
5137 (int8_t) b8, (int8_t) b9, (int8_t) b10, (int8_t) b11,
5138 (int8_t) b12, (int8_t) b13, (int8_t) b14, (int8_t) b15};
5139 return (__m128i) vld1q_s8(data);
5154#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
5155 return vreinterpretq_m128d_f64(vdupq_n_f64(0));
5173#if defined(_sse2neon_shuffle)
5174#define _mm_shuffle_epi32(a, imm) \
5176 int32x4_t _input = vreinterpretq_s32_m128i(a); \
5178 vshuffleq_s32(_input, _input, (imm) & (0x3), ((imm) >> 2) & 0x3, \
5179 ((imm) >> 4) & 0x3, ((imm) >> 6) & 0x3); \
5180 vreinterpretq_m128i_s32(_shuf); \
5183#define _mm_shuffle_epi32(a, imm) \
5184 _sse2neon_define1( \
5185 __m128i, a, __m128i ret; switch (imm) { \
5186 case _MM_SHUFFLE(1, 0, 3, 2): \
5187 ret = _mm_shuffle_epi_1032(_a); \
5189 case _MM_SHUFFLE(2, 3, 0, 1): \
5190 ret = _mm_shuffle_epi_2301(_a); \
5192 case _MM_SHUFFLE(0, 3, 2, 1): \
5193 ret = _mm_shuffle_epi_0321(_a); \
5195 case _MM_SHUFFLE(2, 1, 0, 3): \
5196 ret = _mm_shuffle_epi_2103(_a); \
5198 case _MM_SHUFFLE(1, 0, 1, 0): \
5199 ret = _mm_shuffle_epi_1010(_a); \
5201 case _MM_SHUFFLE(1, 0, 0, 1): \
5202 ret = _mm_shuffle_epi_1001(_a); \
5204 case _MM_SHUFFLE(0, 1, 0, 1): \
5205 ret = _mm_shuffle_epi_0101(_a); \
5207 case _MM_SHUFFLE(2, 2, 1, 1): \
5208 ret = _mm_shuffle_epi_2211(_a); \
5210 case _MM_SHUFFLE(0, 1, 2, 2): \
5211 ret = _mm_shuffle_epi_0122(_a); \
5213 case _MM_SHUFFLE(3, 3, 3, 2): \
5214 ret = _mm_shuffle_epi_3332(_a); \
5216 case _MM_SHUFFLE(0, 0, 0, 0): \
5217 ret = _mm_shuffle_epi32_splat(_a, 0); \
5219 case _MM_SHUFFLE(1, 1, 1, 1): \
5220 ret = _mm_shuffle_epi32_splat(_a, 1); \
5222 case _MM_SHUFFLE(2, 2, 2, 2): \
5223 ret = _mm_shuffle_epi32_splat(_a, 2); \
5225 case _MM_SHUFFLE(3, 3, 3, 3): \
5226 ret = _mm_shuffle_epi32_splat(_a, 3); \
5229 ret = _mm_shuffle_epi32_default(_a, (imm)); \
5231 } _sse2neon_return(ret);)
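/* Illustrative only: _mm_shuffle_epi32 uses the same immediate encoding as
 * _mm_shuffle_ps but reads all four selectors from the single source a. */
#if 0
__m128i v = _mm_set_epi32(3, 2, 1, 0);                     /* lanes 3..0 */
__m128i s = _mm_shuffle_epi32(v, _MM_SHUFFLE(0, 0, 0, 0)); /* broadcast lane 0 */
#endif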
5237#ifdef _sse2neon_shuffle
5238#define _mm_shuffle_pd(a, b, imm8) \
5239 vreinterpretq_m128d_s64( \
5240 vshuffleq_s64(vreinterpretq_s64_m128d(a), vreinterpretq_s64_m128d(b), \
5241 (imm8) & 0x1, (((imm8) & 0x2) >> 1) + 2))
5243#define _mm_shuffle_pd(a, b, imm8) \
5244 _mm_castsi128_pd(_mm_set_epi64x( \
5245 vgetq_lane_s64(vreinterpretq_s64_m128d(b), ((imm8) & 0x2) >> 1), \
5246 vgetq_lane_s64(vreinterpretq_s64_m128d(a), (imm8) & 0x1)))
5251#if defined(_sse2neon_shuffle)
5252#define _mm_shufflehi_epi16(a, imm) \
5254 int16x8_t _input = vreinterpretq_s16_m128i(a); \
5256 vshuffleq_s16(_input, _input, 0, 1, 2, 3, ((imm) & (0x3)) + 4, \
5257 (((imm) >> 2) & 0x3) + 4, (((imm) >> 4) & 0x3) + 4, \
5258 (((imm) >> 6) & 0x3) + 4); \
5259 vreinterpretq_m128i_s16(_shuf); \
5262#define _mm_shufflehi_epi16(a, imm) _mm_shufflehi_epi16_function((a), (imm))
5267#if defined(_sse2neon_shuffle)
5268#define _mm_shufflelo_epi16(a, imm) \
5270 int16x8_t _input = vreinterpretq_s16_m128i(a); \
5271 int16x8_t _shuf = vshuffleq_s16( \
5272 _input, _input, ((imm) & (0x3)), (((imm) >> 2) & 0x3), \
5273 (((imm) >> 4) & 0x3), (((imm) >> 6) & 0x3), 4, 5, 6, 7); \
5274 vreinterpretq_m128i_s16(_shuf); \
5277#define _mm_shufflelo_epi16(a, imm) _mm_shufflelo_epi16_function((a), (imm))
5286 if (_sse2neon_unlikely(c & ~15))
5289 int16x8_t vc = vdupq_n_s16((int16_t) c);
5299 if (_sse2neon_unlikely(c & ~31))
5302 int32x4_t vc = vdupq_n_s32((int32_t) c);
5312 if (_sse2neon_unlikely(c & ~63))
5315 int64x2_t vc = vdupq_n_s64((int64_t) c);
5324 if (_sse2neon_unlikely(imm & ~15))
5335 if (_sse2neon_unlikely(imm & ~31))
5346 if (_sse2neon_unlikely(imm & ~63))
5355#define _mm_slli_si128(a, imm) \
5356 _sse2neon_define1( \
5357 __m128i, a, int8x16_t ret; \
5358 if (_sse2neon_unlikely((imm) == 0)) ret = vreinterpretq_s8_m128i(_a); \
5359 else if (_sse2neon_unlikely((imm) & ~15)) ret = vdupq_n_s8(0); \
5360 else ret = vextq_s8(vdupq_n_s8(0), vreinterpretq_s8_m128i(_a), \
5361 (((imm) <= 0 || (imm) > 15) ? 0 : (16 - (imm)))); \
5362 _sse2neon_return(vreinterpretq_m128i_s8(ret));)
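/* Illustrative only: _mm_slli_si128 shifts the whole 128-bit value left by a
 * byte count, shifting in zero bytes at the low end. */
#if 0
__m128i v = _mm_set_epi64x(0, 1); /* low 64-bit lane == 1 */
__m128i r = _mm_slli_si128(v, 8); /* the 1 moves to the high 64-bit lane */
#endif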
5369#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
5370 return vreinterpretq_m128d_f64(vsqrtq_f64(vreinterpretq_f64_m128d(a)));
5375 double _a0 = sqrt(a0);
5376 double _a1 = sqrt(a1);
5387#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
5402 int64_t c = vgetq_lane_s64(count, 0);
5403 if (_sse2neon_unlikely(c & ~15))
5406 vshlq_s16((int16x8_t) a, vdupq_n_s16((int16_t) -c)));
5414 int64_t c = vgetq_lane_s64(count, 0);
5415 if (_sse2neon_unlikely(c & ~31))
5418 vshlq_s32((int32x4_t) a, vdupq_n_s32((int) -c)));
5426 const int16_t count = (imm & ~15) ? 15 : (int16_t) imm;
5427 return (__m128i) vshlq_s16((int16x8_t) a, vdupq_n_s16(-count));
5434#define _mm_srai_epi32(a, imm) \
5435 _sse2neon_define0( \
5436 __m128i, a, __m128i ret; if (_sse2neon_unlikely((imm) == 0)) { \
5438 } else if (_sse2neon_likely(0 < (imm) && (imm) < 32)) { \
5439 ret = vreinterpretq_m128i_s32( \
5440 vshlq_s32(vreinterpretq_s32_m128i(_a), vdupq_n_s32(-(imm)))); \
5442 ret = vreinterpretq_m128i_s32( \
5443 vshrq_n_s32(vreinterpretq_s32_m128i(_a), 31)); \
5444 } _sse2neon_return(ret);)
5452 if (_sse2neon_unlikely(c & ~15))
5455 int16x8_t vc = vdupq_n_s16(-(int16_t) c);
5465 if (_sse2neon_unlikely(c & ~31))
5468 int32x4_t vc = vdupq_n_s32(-(int32_t) c);
5478 if (_sse2neon_unlikely(c & ~63))
5481 int64x2_t vc = vdupq_n_s64(-(int64_t) c);
5488#define _mm_srli_epi16(a, imm) \
5489 _sse2neon_define0( \
5490 __m128i, a, __m128i ret; if (_sse2neon_unlikely((imm) & ~15)) { \
5491 ret = _mm_setzero_si128(); \
5493 ret = vreinterpretq_m128i_u16(vshlq_u16( \
5494 vreinterpretq_u16_m128i(_a), vdupq_n_s16((int16_t) - (imm)))); \
5495 } _sse2neon_return(ret);)
5501#define _mm_srli_epi32(a, imm) \
5502 _sse2neon_define0( \
5503 __m128i, a, __m128i ret; if (_sse2neon_unlikely((imm) & ~31)) { \
5504 ret = _mm_setzero_si128(); \
5506 ret = vreinterpretq_m128i_u32( \
5507 vshlq_u32(vreinterpretq_u32_m128i(_a), vdupq_n_s32(-(imm)))); \
5508 } _sse2neon_return(ret);)
5513#define _mm_srli_epi64(a, imm) \
5514 _sse2neon_define0( \
5515 __m128i, a, __m128i ret; if (_sse2neon_unlikely((imm) & ~63)) { \
5516 ret = _mm_setzero_si128(); \
5518 ret = vreinterpretq_m128i_u64( \
5519 vshlq_u64(vreinterpretq_u64_m128i(_a), vdupq_n_s64(-(imm)))); \
5520 } _sse2neon_return(ret);)
5525#define _mm_srli_si128(a, imm) \
5526 _sse2neon_define1( \
5527 __m128i, a, int8x16_t ret; \
5528 if (_sse2neon_unlikely((imm) & ~15)) ret = vdupq_n_s8(0); \
5529 else ret = vextq_s8(vreinterpretq_s8_m128i(_a), vdupq_n_s8(0), \
5530 ((imm) > 15 ? 0 : (imm))); \
5531 _sse2neon_return(vreinterpretq_m128i_s8(ret));)
5539#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
5540 vst1q_f64((float64_t *) mem_addr, vreinterpretq_f64_m128d(a));
5552#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
5553 float64x1_t a_low = vget_low_f64(vreinterpretq_f64_m128d(a));
5554 vst1q_f64((float64_t *) mem_addr,
5555 vreinterpretq_f64_m128d(vcombine_f64(a_low, a_low)));
5558 vst1q_f32((float32_t *) mem_addr,
5568#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
5569 vst1_f64((float64_t *) mem_addr, vget_low_f64(vreinterpretq_f64_m128d(a)));
5587#define _mm_store1_pd _mm_store_pd1
5594#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
5595 vst1_f64((float64_t *) mem_addr, vget_high_f64(vreinterpretq_f64_m128d(a)));
5613#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
5614 vst1_f64((float64_t *) mem_addr, vget_low_f64(vreinterpretq_f64_m128d(a)));
5662#if __has_builtin(__builtin_nontemporal_store)
5663 __builtin_nontemporal_store(a, (__m128d *) p);
5664#elif defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
5665 vst1q_f64(p, vreinterpretq_f64_m128d(a));
5677#if __has_builtin(__builtin_nontemporal_store)
5678 __builtin_nontemporal_store(a, p);
5690 vst1q_lane_s32((int32_t *) p, vdupq_n_s32(a), 0);
5699 vst1_s64((int64_t *) p, vdup_n_s64((int64_t) a));
5744#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
5745 return vreinterpretq_m128d_f64(
5746 vsubq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
5759 return vld1q_f32((float32_t *) c);
5817#define _mm_ucomieq_sd _mm_comieq_sd
5818#define _mm_ucomige_sd _mm_comige_sd
5819#define _mm_ucomigt_sd _mm_comigt_sd
5820#define _mm_ucomile_sd _mm_comile_sd
5821#define _mm_ucomilt_sd _mm_comilt_sd
5822#define _mm_ucomineq_sd _mm_comineq_sd
5828#if defined(__GNUC__) || defined(__clang__)
5829#pragma GCC diagnostic push
5830#pragma GCC diagnostic ignored "-Wuninitialized"
5833#if defined(_MSC_VER) && !defined(__clang__)
5837#if defined(__GNUC__) || defined(__clang__)
5838#pragma GCC diagnostic pop
5847#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
5853 int16x4x2_t result = vzip_s16(a1, b1);
5863#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
5869 int32x2x2_t result = vzip_s32(a1, b1);
5879#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
5894#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
5902 int8x8x2_t result = vzip_s8(a1, b1);
5912#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
5913 return vreinterpretq_m128d_f64(
5914 vzip2q_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
5927#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
5933 int16x4x2_t result = vzip_s16(a1, b1);
5943#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
5949 int32x2x2_t result = vzip_s32(a1, b1);
5959#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
5974#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
5980 int8x8x2_t result = vzip_s8(a1, b1);
5990#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
5991 return vreinterpretq_m128d_f64(
5992 vzip1q_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
6027#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
6028 return vreinterpretq_m128d_f64(vfmaq_f64(vreinterpretq_f64_m128d(a),
6029 vreinterpretq_f64_m128d(b),
6030 vreinterpretq_f64_m128d(mask)));
6043#if (defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)) || \
6044 defined(__ARM_FEATURE_FMA)
6058#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
6059 return vreinterpretq_m128d_f64(
6060 vpaddq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
6070 double c[] = {a0 + a1, b0 + b1};
6080#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
6089 vcombine_f32(vpadd_f32(a10, a32), vpadd_f32(b10, b32)));
6098#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
6099 float64x2_t _a = vreinterpretq_f64_m128d(a);
6100 float64x2_t _b = vreinterpretq_f64_m128d(b);
6101 return vreinterpretq_m128d_f64(
6102 vsubq_f64(vuzp1q_f64(_a, _b), vuzp2q_f64(_a, _b)));
6112 double c[] = {a0 - a1, b0 - b1};
6124#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
6126 vsubq_f32(vuzp1q_f32(a, b), vuzp2q_f32(a, b)));
6128 float32x4x2_t c = vuzpq_f32(a, b);
6137#define _mm_lddqu_si128 _mm_loadu_si128
6142#define _mm_loaddup_pd _mm_load1_pd
6149#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
6150 return vreinterpretq_m128d_f64(
6151 vdupq_laneq_f64(vreinterpretq_f64_m128d(a), 0));
6163#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
6166#elif defined(_sse2neon_shuffle)
6182#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
6185#elif defined(_sse2neon_shuffle)
6249#if defined(__GNUC__) && !defined(__clang__)
6250#define _mm_alignr_epi8(a, b, imm) \
6252 uint8x16_t _a = vreinterpretq_u8_m128i(a); \
6253 uint8x16_t _b = vreinterpretq_u8_m128i(b); \
6255 if (_sse2neon_unlikely((imm) & ~31)) \
6256 ret = vreinterpretq_m128i_u8(vdupq_n_u8(0)); \
6257 else if ((imm) >= 16) \
6258 ret = _mm_srli_si128(a, (imm) >= 16 ? (imm) - 16 : 0); \
6260 ret = vreinterpretq_m128i_u8( \
6261 vextq_u8(_b, _a, (imm) < 16 ? (imm) : 0)); \
6266#define _mm_alignr_epi8(a, b, imm) \
6267 _sse2neon_define2( \
6268 __m128i, a, b, uint8x16_t __a = vreinterpretq_u8_m128i(_a); \
6269 uint8x16_t __b = vreinterpretq_u8_m128i(_b); __m128i ret; \
6270 if (_sse2neon_unlikely((imm) & ~31)) ret = \
6271 vreinterpretq_m128i_u8(vdupq_n_u8(0)); \
6272 else if ((imm) >= 16) ret = \
6273 _mm_srli_si128(_a, (imm) >= 16 ? (imm) - 16 : 0); \
6274 else ret = vreinterpretq_m128i_u8( \
6275 vextq_u8(__b, __a, (imm) < 16 ? (imm) : 0)); \
6276 _sse2neon_return(ret);)
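// Illustrative sketch (not part of the original header): _mm_alignr_epi8(a, b,
// imm) treats a:b (a in the upper 16 bytes) as one 32-byte value and returns
// the 16 bytes starting at byte offset imm, e.g.
//   __m128i lo = _mm_set1_epi8(1), hi = _mm_set1_epi8(2);
//   __m128i r = _mm_alignr_epi8(hi, lo, 4); // bytes 4..15 of lo, then 0..3 of hi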
6283#define _mm_alignr_pi8(a, b, imm) \
6284 _sse2neon_define2( \
6285 __m64, a, b, __m64 ret; if (_sse2neon_unlikely((imm) >= 16)) { \
6286 ret = vreinterpret_m64_s8(vdup_n_s8(0)); \
6288 uint8x8_t tmp_low; \
6289 uint8x8_t tmp_high; \
6291 const int idx = (imm) - 8; \
6292 tmp_low = vreinterpret_u8_m64(_a); \
6293 tmp_high = vdup_n_u8(0); \
6294 ret = vreinterpret_m64_u8(vext_u8(tmp_low, tmp_high, idx)); \
6296 const int idx = (imm); \
6297 tmp_low = vreinterpret_u8_m64(_b); \
6298 tmp_high = vreinterpret_u8_m64(_a); \
6299 ret = vreinterpret_m64_u8(vext_u8(tmp_low, tmp_high, idx)); \
6301 } _sse2neon_return(ret);)
6310#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
6314 vcombine_s16(vpadd_s16(vget_low_s16(a), vget_high_s16(a)),
6315 vpadd_s16(vget_low_s16(b), vget_high_s16(b))));
6326#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
6330 vcombine_s32(vpadd_s32(vget_low_s32(a), vget_high_s32(a)),
6331 vpadd_s32(vget_low_s32(b), vget_high_s32(b))));
6358#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
6361 return vreinterpretq_s64_s16(
6362 vqaddq_s16(vuzp1q_s16(a, b), vuzp2q_s16(a, b)));
6369 int16x8_t ab0246 = vcombine_s16(vmovn_s32(a), vmovn_s32(b));
6370 int16x8_t ab1357 = vcombine_s16(vshrn_n_s32(a, 16), vshrn_n_s32(b, 16));
6383#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
6384 return vreinterpret_s64_s16(vqadd_s16(vuzp1_s16(a, b), vuzp2_s16(a, b)));
6386 int16x4x2_t res = vuzp_s16(a, b);
6387 return vreinterpret_s64_s16(vqadd_s16(res.val[0], res.val[1]));
6398#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
6400 vsubq_s16(vuzp1q_s16(a, b), vuzp2q_s16(a, b)));
6402 int16x8x2_t c = vuzpq_s16(a, b);
6414#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
6416 vsubq_s32(vuzp1q_s32(a, b), vuzp2q_s32(a, b)));
6418 int32x4x2_t c = vuzpq_s32(a, b);
6430#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
6433 int16x4x2_t c = vuzp_s16(a, b);
6445#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
6448 int32x2x2_t c = vuzp_s32(a, b);
6460#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
6462 vqsubq_s16(vuzp1q_s16(a, b), vuzp2q_s16(a, b)));
6464 int16x8x2_t c = vuzpq_s16(a, b);
6476#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
6479 int16x4x2_t c = vuzp_s16(a, b);
6491#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
6494 int16x8_t tl = vmulq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(a))),
6495 vmovl_s8(vget_low_s8(b)));
6496 int16x8_t th = vmulq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(a))),
6497 vmovl_s8(vget_high_s8(b)));
6499 vqaddq_s16(vuzp1q_s16(tl, th), vuzp2q_s16(tl, th)));
6507 int16x8_t a_odd = vreinterpretq_s16_u16(vshrq_n_u16(a, 8));
6508 int16x8_t a_even = vreinterpretq_s16_u16(vbicq_u16(a, vdupq_n_u16(0xff00)));
6511 int16x8_t b_even = vshrq_n_s16(vshlq_n_s16(b, 8), 8);
6512 int16x8_t b_odd = vshrq_n_s16(b, 8);
6515 int16x8_t prod1 = vmulq_s16(a_even, b_even);
6516 int16x8_t prod2 = vmulq_s16(a_odd, b_odd);
6534 int16x4_t a_odd = vreinterpret_s16_u16(vshr_n_u16(a, 8));
6535 int16x4_t a_even = vreinterpret_s16_u16(vand_u16(a, vdup_n_u16(0xff)));
6538 int16x4_t b_even = vshr_n_s16(vshl_n_s16(b, 8), 8);
6539 int16x4_t b_odd = vshr_n_s16(b, 8);
6542 int16x4_t prod1 = vmul_s16(a_even, b_even);
6543 int16x4_t prod2 = vmul_s16(a_odd, b_odd);
6566 int16x4_t narrow_lo = vrshrn_n_s32(mul_lo, 15);
6567 int16x4_t narrow_hi = vrshrn_n_s32(mul_hi, 15);
6579 int32x4_t mul_extend =
6593 uint8x16_t idx_masked =
6594 vandq_u8(idx, vdupq_n_u8(0x8F));
6595#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
6597#elif defined(__GNUC__)
6601 __asm__ __volatile__(
6602 "vtbl.8 %e[ret], {%e[tbl], %f[tbl]}, %e[idx]\n"
6603 "vtbl.8 %f[ret], {%e[tbl], %f[tbl]}, %f[idx]\n"
6605 : [tbl] "w"(tbl), [idx] "w"(idx_masked));
6609 int8x8x2_t a_split = {vget_low_s8(tbl), vget_high_s8(tbl)};
6611 vcombine_s8(vtbl2_s8(a_split, vget_low_u8(idx_masked)),
6612 vtbl2_s8(a_split, vget_high_u8(idx_masked))));
6621 const int8x8_t controlMask =
6639 uint16x8_t ltMask = vreinterpretq_u16_s16(vshrq_n_s16(b, 15));
6641#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
6642 int16x8_t zeroMask = vreinterpretq_s16_u16(vceqzq_s16(b));
6644 int16x8_t zeroMask = vreinterpretq_s16_u16(vceqq_s16(b, vdupq_n_s16(0)));
6649 int16x8_t masked = vbslq_s16(ltMask, vnegq_s16(a), a);
6651 int16x8_t res = vbicq_s16(masked, zeroMask);
6667 uint32x4_t ltMask = vreinterpretq_u32_s32(vshrq_n_s32(b, 31));
6670#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
6671 int32x4_t zeroMask = vreinterpretq_s32_u32(vceqzq_s32(b));
6673 int32x4_t zeroMask = vreinterpretq_s32_u32(vceqq_s32(b, vdupq_n_s32(0)));
6678 int32x4_t masked = vbslq_s32(ltMask, vnegq_s32(a), a);
6680 int32x4_t res = vbicq_s32(masked, zeroMask);
6696 uint8x16_t ltMask = vreinterpretq_u8_s8(vshrq_n_s8(b, 7));
6699#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
6700 int8x16_t zeroMask = vreinterpretq_s8_u8(vceqzq_s8(b));
6702 int8x16_t zeroMask = vreinterpretq_s8_u8(vceqq_s8(b, vdupq_n_s8(0)));
6707 int8x16_t masked = vbslq_s8(ltMask, vnegq_s8(a), a);
6709 int8x16_t res = vbicq_s8(masked, zeroMask);
6725 uint16x4_t ltMask = vreinterpret_u16_s16(vshr_n_s16(b, 15));
6728#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
6729 int16x4_t zeroMask = vreinterpret_s16_u16(vceqz_s16(b));
6731 int16x4_t zeroMask = vreinterpret_s16_u16(vceq_s16(b, vdup_n_s16(0)));
6736 int16x4_t masked = vbsl_s16(ltMask, vneg_s16(a), a);
6738 int16x4_t res = vbic_s16(masked, zeroMask);
6754 uint32x2_t ltMask = vreinterpret_u32_s32(vshr_n_s32(b, 31));
6757#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
6758 int32x2_t zeroMask = vreinterpret_s32_u32(vceqz_s32(b));
6760 int32x2_t zeroMask = vreinterpret_s32_u32(vceq_s32(b, vdup_n_s32(0)));
6765 int32x2_t masked = vbsl_s32(ltMask, vneg_s32(a), a);
6767 int32x2_t res = vbic_s32(masked, zeroMask);
6783 uint8x8_t ltMask = vreinterpret_u8_s8(vshr_n_s8(b, 7));
6786#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
6787 int8x8_t zeroMask = vreinterpret_s8_u8(vceqz_s8(b));
6789 int8x8_t zeroMask = vreinterpret_s8_u8(vceq_s8(b, vdup_n_s8(0)));
6794 int8x8_t masked = vbsl_s8(ltMask, vneg_s8(a), a);
6796 int8x8_t res = vbic_s8(masked, zeroMask);
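// Each _mm_sign_* fallback above follows the same pattern: select the negated
// value of a where b is negative (vbsl on the sign mask), then clear the lanes
// where b is zero (vbic with the zero-equality mask).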
6808#define _mm_blend_epi16(a, b, imm) \
6809 _sse2neon_define2( \
6811 const uint16_t _mask[8] = \
6812 _sse2neon_init(((imm) & (1 << 0)) ? (uint16_t) - 1 : 0x0, \
6813 ((imm) & (1 << 1)) ? (uint16_t) - 1 : 0x0, \
6814 ((imm) & (1 << 2)) ? (uint16_t) - 1 : 0x0, \
6815 ((imm) & (1 << 3)) ? (uint16_t) - 1 : 0x0, \
6816 ((imm) & (1 << 4)) ? (uint16_t) - 1 : 0x0, \
6817 ((imm) & (1 << 5)) ? (uint16_t) - 1 : 0x0, \
6818 ((imm) & (1 << 6)) ? (uint16_t) - 1 : 0x0, \
6819 ((imm) & (1 << 7)) ? (uint16_t) - 1 : 0x0); \
6820 uint16x8_t _mask_vec = vld1q_u16(_mask); \
6821 uint16x8_t __a = vreinterpretq_u16_m128i(_a); \
6822 uint16x8_t __b = vreinterpretq_u16_m128i(_b); _sse2neon_return( \
6823 vreinterpretq_m128i_u16(vbslq_u16(_mask_vec, __b, __a)));)
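// _mm_blend_epi16 expands the 8-bit immediate into a per-lane 16-bit mask and
// uses vbslq_u16 to take lane i from b when bit i of imm is set, otherwise
// from a; _mm_blend_pd below does the same with two 64-bit lanes.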
6828#define _mm_blend_pd(a, b, imm) \
6829 _sse2neon_define2( \
6831 const uint64_t _mask[2] = \
6832 _sse2neon_init(((imm) & (1 << 0)) ? ~UINT64_C(0) : UINT64_C(0), \
6833 ((imm) & (1 << 1)) ? ~UINT64_C(0) : UINT64_C(0)); \
6834 uint64x2_t _mask_vec = vld1q_u64(_mask); \
6835 uint64x2_t __a = vreinterpretq_u64_m128d(_a); \
6836 uint64x2_t __b = vreinterpretq_u64_m128d(_b); _sse2neon_return( \
6837 vreinterpretq_m128d_u64(vbslq_u64(_mask_vec, __b, __a)));)
6845 (imm8 & (1 << 0)) ? UINT32_MAX : 0, (imm8 & (1 << 1)) ? UINT32_MAX : 0,
6846 (imm8 & (1 << 2)) ? UINT32_MAX : 0, (imm8 & (1 << 3)) ? UINT32_MAX : 0};
6847 uint32x4_t mask = vld1q_u32(data);
6873#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
6874 float64x2_t a = vreinterpretq_f64_m128d(_a);
6875 float64x2_t b = vreinterpretq_f64_m128d(_b);
6876 return vreinterpretq_m128d_f64(vbslq_f64(mask, b, a));
6903#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
6904 return vreinterpretq_m128d_f64(vrndpq_f64(vreinterpretq_f64_m128d(a)));
6919#if (defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)) || \
6920 defined(__ARM_FEATURE_DIRECTED_ROUNDING)
6923 float *f = (float *) &a;
6924 return _mm_set_ps(ceilf(f[3]), ceilf(f[2]), ceilf(f[1]), ceilf(f[0]));
6952#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
6960 uint32x4_t swapped = vrev64q_u32(cmp);
6980 int32x4_t s32x4 = vmovl_s16(vget_low_s16(s16x8));
6981 int64x2_t s64x2 = vmovl_s32(vget_low_s32(s32x4));
7000 int16x8_t s16x8 = vmovl_s8(vget_low_s8(s8x16));
7010 int16x8_t s16x8 = vmovl_s8(vget_low_s8(s8x16));
7011 int32x4_t s32x4 = vmovl_s16(vget_low_s16(s16x8));
7021 int16x8_t s16x8 = vmovl_s8(vget_low_s8(s8x16));
7022 int32x4_t s32x4 = vmovl_s16(vget_low_s16(s16x8));
7023 int64x2_t s64x2 = vmovl_s32(vget_low_s32(s32x4));
7042 uint32x4_t u32x4 = vmovl_u16(vget_low_u16(u16x8));
7043 uint64x2_t u64x2 = vmovl_u32(vget_low_u32(u32x4));
7062 uint16x8_t u16x8 = vmovl_u8(vget_low_u8(u8x16));
7072 uint16x8_t u16x8 = vmovl_u8(vget_low_u8(u8x16));
7073 uint32x4_t u32x4 = vmovl_u16(vget_low_u16(u16x8));
7083 uint16x8_t u16x8 = vmovl_u8(vget_low_u8(u8x16));
7084 uint32x4_t u32x4 = vmovl_u16(vget_low_u16(u16x8));
7085 uint64x2_t u64x2 = vmovl_u32(vget_low_u32(u32x4));
7096 const int64_t bit0Mask = imm & 0x01 ? UINT64_MAX : 0;
7097 const int64_t bit1Mask = imm & 0x02 ? UINT64_MAX : 0;
7098#if !SSE2NEON_PRECISE_DP
7099 const int64_t bit4Mask = imm & 0x10 ? UINT64_MAX : 0;
7100 const int64_t bit5Mask = imm & 0x20 ? UINT64_MAX : 0;
7103#if !SSE2NEON_PRECISE_DP
7109#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
7110 double d0 = (imm & 0x10) ? vgetq_lane_f64(vreinterpretq_f64_m128d(a), 0) *
7111 vgetq_lane_f64(vreinterpretq_f64_m128d(b), 0)
7113 double d1 = (imm & 0x20) ? vgetq_lane_f64(vreinterpretq_f64_m128d(a), 1) *
7114 vgetq_lane_f64(vreinterpretq_f64_m128d(b), 1)
7125 double d0 = (imm & 0x10) ? a0 * b0 : 0;
7126 double d1 = (imm & 0x20) ? a1 * b1 : 0;
7131#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
7132 double sum = vpaddd_f64(vreinterpretq_f64_m128d(tmp));
7138 double sum = _tmp0 + _tmp1;
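// In _mm_dp_pd above and _mm_dp_ps below, the high nibble of the immediate
// selects which element-wise products contribute to the sum and the low
// nibble selects which destination lanes receive that sum (others are zeroed).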
7153 float32x4_t elementwise_prod = _mm_mul_ps(a, b);
7155#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
7161 if ((imm & 0x0F) == 0x0F) {
7162 if (!(imm & (1 << 4)))
7163 elementwise_prod = vsetq_lane_f32(0.0f, elementwise_prod, 0);
7164 if (!(imm & (1 << 5)))
7165 elementwise_prod = vsetq_lane_f32(0.0f, elementwise_prod, 1);
7166 if (!(imm & (1 << 6)))
7167 elementwise_prod = vsetq_lane_f32(0.0f, elementwise_prod, 2);
7168 if (!(imm & (1 << 7)))
7169 elementwise_prod = vsetq_lane_f32(0.0f, elementwise_prod, 3);
7178 s += vgetq_lane_f32(elementwise_prod, 0);
7180 s += vgetq_lane_f32(elementwise_prod, 1);
7182 s += vgetq_lane_f32(elementwise_prod, 2);
7184 s += vgetq_lane_f32(elementwise_prod, 3);
7186 const float32_t res[4] = {
7187 (imm & 0x1) ? s : 0.0f,
7188 (imm & 0x2) ? s : 0.0f,
7189 (imm & 0x4) ? s : 0.0f,
7190 (imm & 0x8) ? s : 0.0f,
7199#define _mm_extract_epi32(a, imm) \
7200 vgetq_lane_s32(vreinterpretq_s32_m128i(a), (imm))
7206#define _mm_extract_epi64(a, imm) \
7207 vgetq_lane_s64(vreinterpretq_s64_m128i(a), (imm))
7213#define _mm_extract_epi8(a, imm) vgetq_lane_u8(vreinterpretq_u8_m128i(a), (imm))
7217#define _mm_extract_ps(a, imm) vgetq_lane_s32(vreinterpretq_s32_m128(a), (imm))
7225#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
7226 return vreinterpretq_m128d_f64(vrndmq_f64(vreinterpretq_f64_m128d(a)));
7241#if (defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)) || \
7242 defined(__ARM_FEATURE_DIRECTED_ROUNDING)
7245 float *f = (float *) &a;
7246 return _mm_set_ps(floorf(f[3]), floorf(f[2]), floorf(f[1]), floorf(f[0]));
7275#define _mm_insert_epi32(a, b, imm) \
7276 vreinterpretq_m128i_s32( \
7277 vsetq_lane_s32((b), vreinterpretq_s32_m128i(a), (imm)))
7284#define _mm_insert_epi64(a, b, imm) \
7285 vreinterpretq_m128i_s64( \
7286 vsetq_lane_s64((b), vreinterpretq_s64_m128i(a), (imm)))
7293#define _mm_insert_epi8(a, b, imm) \
7294 vreinterpretq_m128i_s8(vsetq_lane_s8((b), vreinterpretq_s8_m128i(a), (imm)))
7300#define _mm_insert_ps(a, b, imm8) \
7301 _sse2neon_define2( \
7303 float32x4_t tmp1 = \
7304 vsetq_lane_f32(vgetq_lane_f32(_b, ((imm8) >> 6) & 0x3), \
7305 vreinterpretq_f32_m128(_a), 0); \
7306 float32x4_t tmp2 = \
7307 vsetq_lane_f32(vgetq_lane_f32(tmp1, 0), \
7308 vreinterpretq_f32_m128(_a), (((imm8) >> 4) & 0x3)); \
7309 const uint32_t data[4] = \
7310 _sse2neon_init(((imm8) & (1 << 0)) ? UINT32_MAX : 0, \
7311 ((imm8) & (1 << 1)) ? UINT32_MAX : 0, \
7312 ((imm8) & (1 << 2)) ? UINT32_MAX : 0, \
7313 ((imm8) & (1 << 3)) ? UINT32_MAX : 0); \
7314 uint32x4_t mask = vld1q_u32(data); \
7315 float32x4_t all_zeros = vdupq_n_f32(0); \
7317 _sse2neon_return(vreinterpretq_m128_f32( \
7318 vbslq_f32(mask, all_zeros, vreinterpretq_f32_m128(tmp2))));)
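// For _mm_insert_ps, bits [7:6] of imm8 pick the source lane of b, bits [5:4]
// pick the destination lane in a, and bits [3:0] zero individual result lanes,
// which is why the mask built above selects all_zeros for the flagged lanes.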
7398 uint16_t min, idx = 0;
7399#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
7404 static const uint16_t idxv[] = {0, 1, 2, 3, 4, 5, 6, 7};
7405 uint16x8_t minv = vdupq_n_u16(min);
7407 idx = vminvq_u16(vornq_u16(vld1q_u16(idxv), cmeq));
7421 for (i = 0; i < 8; i++) {
7449 switch (imm & 0x4) {
7459#if defined(__GNUC__) || defined(__clang__)
7460 __builtin_unreachable();
7461#elif defined(_MSC_VER)
7467 switch (imm & 0x3) {
7469 _b = vreinterpretq_u8_u32(
7473 _b = vreinterpretq_u8_u32(
7477 _b = vreinterpretq_u8_u32(
7481 _b = vreinterpretq_u8_u32(
7485#if defined(__GNUC__) || defined(__clang__)
7486 __builtin_unreachable();
7487#elif defined(_MSC_VER)
7493 int16x8_t c04, c15, c26, c37;
7494 uint8x8_t low_b = vget_low_u8(_b);
7495 c04 = vreinterpretq_s16_u16(vabdl_u8(vget_low_u8(_a), low_b));
7496 uint8x16_t _a_1 = vextq_u8(_a, _a, 1);
7497 c15 = vreinterpretq_s16_u16(vabdl_u8(vget_low_u8(_a_1), low_b));
7498 uint8x16_t _a_2 = vextq_u8(_a, _a, 2);
7499 c26 = vreinterpretq_s16_u16(vabdl_u8(vget_low_u8(_a_2), low_b));
7500 uint8x16_t _a_3 = vextq_u8(_a, _a, 3);
7501 c37 = vreinterpretq_s16_u16(vabdl_u8(vget_low_u8(_a_3), low_b));
7502#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
7504 c04 = vpaddq_s16(c04, c26);
7506 c15 = vpaddq_s16(c15, c37);
7509 vtrn1q_s32(vreinterpretq_s32_s16(c04), vreinterpretq_s32_s16(c15));
7511 vtrn2q_s32(vreinterpretq_s32_s16(c04), vreinterpretq_s32_s16(c15));
7513 vreinterpretq_s16_s32(trn2_c)));
7515 int16x4_t c01, c23, c45, c67;
7516 c01 = vpadd_s16(vget_low_s16(c04), vget_low_s16(c15));
7517 c23 = vpadd_s16(vget_low_s16(c26), vget_low_s16(c37));
7518 c45 = vpadd_s16(vget_high_s16(c04), vget_high_s16(c15));
7519 c67 = vpadd_s16(vget_high_s16(c26), vget_high_s16(c37));
7522 vcombine_s16(vpadd_s16(c01, c23), vpadd_s16(c45, c67)));
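// _mm_mpsadbw_epu8 computes eight sums of absolute differences between a
// sliding 4-byte window of a and a 4-byte block of b selected by the
// immediate; the vabdl/vpadd sequences above accumulate the four byte
// differences for each window position.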
7562#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
7565 return vreinterpretq_m128d_f64(vrndnq_f64(vreinterpretq_f64_m128d(a)));
7571 return vreinterpretq_m128d_f64(vrndq_f64(vreinterpretq_f64_m128d(a)));
7573 return vreinterpretq_m128d_f64(vrndiq_f64(vreinterpretq_f64_m128d(a)));
7576 double *v_double = (double *) &a;
7582 for (int i = 0; i < 2; i++) {
7583 tmp = (v_double[i] < 0) ? -v_double[i] : v_double[i];
7584 double roundDown = floor(tmp);
7585 double roundUp = ceil(tmp);
7586 double diffDown = tmp - roundDown;
7587 double diffUp = roundUp - tmp;
7588 if (diffDown < diffUp) {
7591 } else if (diffDown > diffUp) {
7597 double half = roundDown / 2;
7598 if (half != floor(half)) {
7608 res[i] = (v_double[i] < 0) ? -res[i] : res[i];
7620 return _mm_set_pd(v_double[1] > 0 ? floor(v_double[1]) : ceil(v_double[1]),
7621 v_double[0] > 0 ? floor(v_double[0]) : ceil(v_double[0]));
7631#if (defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)) || \
7632 defined(__ARM_FEATURE_DIRECTED_ROUNDING)
7646 float *v_float = (float *) &a;
7651 uint32x4_t signmask = vdupq_n_u32(0x80000000);
7654 int32x4_t r_normal = vcvtq_s32_f32(vaddq_f32(
7656 int32x4_t r_trunc = vcvtq_s32_f32(
7658 int32x4_t plusone = vreinterpretq_s32_u32(vshrq_n_u32(
7659 vreinterpretq_u32_s32(vnegq_s32(r_trunc)), 31));
7660 int32x4_t r_even = vbicq_s32(vaddq_s32(r_trunc, plusone),
7662 float32x4_t delta = vsubq_f32(
7664 vcvtq_f32_s32(r_trunc));
7665 uint32x4_t is_delta_half =
7666 vceqq_f32(delta, half);
7668 vcvtq_f32_s32(vbslq_s32(is_delta_half, r_even, r_normal)));
7678 return _mm_set_ps(v_float[3] > 0 ? floorf(v_float[3]) : ceilf(v_float[3]),
7679 v_float[2] > 0 ? floorf(v_float[2]) : ceilf(v_float[2]),
7680 v_float[1] > 0 ? floorf(v_float[1]) : ceilf(v_float[1]),
7681 v_float[0] > 0 ? floorf(v_float[0]) : ceilf(v_float[0]));
7721#if __has_builtin(__builtin_nontemporal_store)
7722 return __builtin_nontemporal_load(p);
7733 return (uint64_t) (vgetq_lane_s64(a, 0) & vgetq_lane_s64(a, 1)) ==
7742 int64x2_t a_and_mask =
7744 return !(vgetq_lane_s64(a_and_mask, 0) | vgetq_lane_s64(a_and_mask, 1));
7760 uint64x2_t ones = vandq_u64(m, v);
7761 uint64x2_t zeros = vbicq_u64(m, v);
7765 uint32x2_t reduced = vpmax_u32(vqmovn_u64(ones), vqmovn_u64(zeros));
7768 return (vget_lane_u32(vpmin_u32(reduced, reduced), 0) != 0);
7780 return !(vgetq_lane_s64(s64_vec, 0) | vgetq_lane_s64(s64_vec, 1));
7789#define _mm_testnzc_si128(a, b) _mm_test_mix_ones_zeros(a, b)
7800 return !(vgetq_lane_s64(s64_vec, 0) | vgetq_lane_s64(s64_vec, 1));
7805 static const uint16_t ALIGN_STRUCT(16) _sse2neon_cmpestr_mask16b[8] = {
7806 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
7808 static const uint8_t ALIGN_STRUCT(16) _sse2neon_cmpestr_mask8b[16] = {
7809 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
7810 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
7814#define _SIDD_UBYTE_OPS 0x00
7815#define _SIDD_UWORD_OPS 0x01
7816#define _SIDD_SBYTE_OPS 0x02
7817#define _SIDD_SWORD_OPS 0x03
7820#define _SIDD_CMP_EQUAL_ANY 0x00
7821#define _SIDD_CMP_RANGES 0x04
7822#define _SIDD_CMP_EQUAL_EACH 0x08
7823#define _SIDD_CMP_EQUAL_ORDERED 0x0C
7826#define _SIDD_POSITIVE_POLARITY 0x00
7827#define _SIDD_MASKED_POSITIVE_POLARITY 0x20
7828#define _SIDD_NEGATIVE_POLARITY 0x10
7829#define _SIDD_MASKED_NEGATIVE_POLARITY \
7833#define _SIDD_LEAST_SIGNIFICANT 0x00
7834#define _SIDD_MOST_SIGNIFICANT 0x40
7837#define _SIDD_BIT_MASK 0x00
7838#define _SIDD_UNIT_MASK 0x40
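// Illustrative sketch (not part of the original header): the _SIDD_* flags are
// OR-ed together to form the immediate of the SSE4.2 string intrinsics, e.g.
//   int idx = _mm_cmpistri(needle, haystack,
//                          _SIDD_UBYTE_OPS | _SIDD_CMP_EQUAL_ORDERED);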
7845#define SSE2NEON_PRIMITIVE_CAT(a, ...) a##__VA_ARGS__
7846#define SSE2NEON_CAT(a, b) SSE2NEON_PRIMITIVE_CAT(a, b)
7848#define SSE2NEON_IIF(c) SSE2NEON_PRIMITIVE_CAT(SSE2NEON_IIF_, c)
7850#define SSE2NEON_IIF_0(t, ...) __VA_ARGS__
7852#define SSE2NEON_IIF_1(t, ...) t
7854#define SSE2NEON_COMPL(b) SSE2NEON_PRIMITIVE_CAT(SSE2NEON_COMPL_, b)
7855#define SSE2NEON_COMPL_0 1
7856#define SSE2NEON_COMPL_1 0
7858#define SSE2NEON_DEC(x) SSE2NEON_PRIMITIVE_CAT(SSE2NEON_DEC_, x)
7859#define SSE2NEON_DEC_1 0
7860#define SSE2NEON_DEC_2 1
7861#define SSE2NEON_DEC_3 2
7862#define SSE2NEON_DEC_4 3
7863#define SSE2NEON_DEC_5 4
7864#define SSE2NEON_DEC_6 5
7865#define SSE2NEON_DEC_7 6
7866#define SSE2NEON_DEC_8 7
7867#define SSE2NEON_DEC_9 8
7868#define SSE2NEON_DEC_10 9
7869#define SSE2NEON_DEC_11 10
7870#define SSE2NEON_DEC_12 11
7871#define SSE2NEON_DEC_13 12
7872#define SSE2NEON_DEC_14 13
7873#define SSE2NEON_DEC_15 14
7874#define SSE2NEON_DEC_16 15
7877#define SSE2NEON_CHECK_N(x, n, ...) n
7878#define SSE2NEON_CHECK(...) SSE2NEON_CHECK_N(__VA_ARGS__, 0, )
7879#define SSE2NEON_PROBE(x) x, 1,
7881#define SSE2NEON_NOT(x) SSE2NEON_CHECK(SSE2NEON_PRIMITIVE_CAT(SSE2NEON_NOT_, x))
7882#define SSE2NEON_NOT_0 SSE2NEON_PROBE(~)
7884#define SSE2NEON_BOOL(x) SSE2NEON_COMPL(SSE2NEON_NOT(x))
7885#define SSE2NEON_IF(c) SSE2NEON_IIF(SSE2NEON_BOOL(c))
7887#define SSE2NEON_EAT(...)
7888#define SSE2NEON_EXPAND(...) __VA_ARGS__
7889#define SSE2NEON_WHEN(c) SSE2NEON_IF(c)(SSE2NEON_EXPAND, SSE2NEON_EAT)
7893#define SSE2NEON_EMPTY()
7894#define SSE2NEON_DEFER(id) id SSE2NEON_EMPTY()
7895#define SSE2NEON_OBSTRUCT(...) __VA_ARGS__ SSE2NEON_DEFER(SSE2NEON_EMPTY)()
7896#define SSE2NEON_EXPAND(...) __VA_ARGS__
7898#define SSE2NEON_EVAL(...) \
7899 SSE2NEON_EVAL1(SSE2NEON_EVAL1(SSE2NEON_EVAL1(__VA_ARGS__)))
7900#define SSE2NEON_EVAL1(...) \
7901 SSE2NEON_EVAL2(SSE2NEON_EVAL2(SSE2NEON_EVAL2(__VA_ARGS__)))
7902#define SSE2NEON_EVAL2(...) \
7903 SSE2NEON_EVAL3(SSE2NEON_EVAL3(SSE2NEON_EVAL3(__VA_ARGS__)))
7904#define SSE2NEON_EVAL3(...) __VA_ARGS__
7906#define SSE2NEON_REPEAT(count, macro, ...) \
7907 SSE2NEON_WHEN(count) \
7908 (SSE2NEON_OBSTRUCT(SSE2NEON_REPEAT_INDIRECT)()( \
7909 SSE2NEON_DEC(count), macro, \
7910 __VA_ARGS__) SSE2NEON_OBSTRUCT(macro)(SSE2NEON_DEC(count), \
7912#define SSE2NEON_REPEAT_INDIRECT() SSE2NEON_REPEAT
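// SSE2NEON_EVAL rescans its argument several times so that the deferred
// SSE2NEON_REPEAT_INDIRECT call can expand SSE2NEON_REPEAT recursively,
// emitting macro(0, ...) through macro(count - 1, ...).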
7914#define SSE2NEON_SIZE_OF_byte 8
7915#define SSE2NEON_NUMBER_OF_LANES_byte 16
7916#define SSE2NEON_SIZE_OF_word 16
7917#define SSE2NEON_NUMBER_OF_LANES_word 8
7919#define SSE2NEON_COMPARE_EQUAL_THEN_FILL_LANE(i, type) \
7920 mtx[i] = vreinterpretq_m128i_##type(vceqq_##type( \
7921 vdupq_n_##type(vgetq_lane_##type(vreinterpretq_##type##_m128i(b), i)), \
7922 vreinterpretq_##type##_m128i(a)));
7924#define SSE2NEON_FILL_LANE(i, type) \
7926 vdupq_n_##type(vgetq_lane_##type(vreinterpretq_##type##_m128i(b), i));
7928#define PCMPSTR_RANGES(a, b, mtx, data_type_prefix, type_prefix, size, \
7929 number_of_lanes, byte_or_word) \
7933 SSE2NEON_CAT(size, \
7934 SSE2NEON_CAT(x, SSE2NEON_CAT(number_of_lanes, _t)))) \
7935 vec_b[number_of_lanes]; \
7936 __m128i mask = SSE2NEON_IIF(byte_or_word)( \
7937 vreinterpretq_m128i_u16(vdupq_n_u16(0xff)), \
7938 vreinterpretq_m128i_u32(vdupq_n_u32(0xffff))); \
7939 SSE2NEON_EVAL(SSE2NEON_REPEAT(number_of_lanes, SSE2NEON_FILL_LANE, \
7940 SSE2NEON_CAT(type_prefix, size))) \
7941 for (int i = 0; i < number_of_lanes; i++) { \
7942 mtx[i] = SSE2NEON_CAT(vreinterpretq_m128i_u, \
7943 size)(SSE2NEON_CAT(vbslq_u, size)( \
7944 SSE2NEON_CAT(vreinterpretq_u, \
7945 SSE2NEON_CAT(size, _m128i))(mask), \
7946 SSE2NEON_CAT(vcgeq_, SSE2NEON_CAT(type_prefix, size))( \
7950 SSE2NEON_CAT(type_prefix, \
7951 SSE2NEON_CAT(size, _m128i(a))))), \
7952 SSE2NEON_CAT(vcleq_, SSE2NEON_CAT(type_prefix, size))( \
7956 SSE2NEON_CAT(type_prefix, \
7957 SSE2NEON_CAT(size, _m128i(a))))))); \
7961#define PCMPSTR_EQ(a, b, mtx, size, number_of_lanes) \
7963 SSE2NEON_EVAL(SSE2NEON_REPEAT(number_of_lanes, \
7964 SSE2NEON_COMPARE_EQUAL_THEN_FILL_LANE, \
7965 SSE2NEON_CAT(u, size))) \
7968#define SSE2NEON_CMP_EQUAL_ANY_IMPL(type) \
7969 static uint16_t _sse2neon_cmp_##type##_equal_any(__m128i a, int la, \
7970 __m128i b, int lb) \
7973 PCMPSTR_EQ(a, b, mtx, SSE2NEON_CAT(SSE2NEON_SIZE_OF_, type), \
7974 SSE2NEON_CAT(SSE2NEON_NUMBER_OF_LANES_, type)); \
7975 return SSE2NEON_CAT( \
7976 _sse2neon_aggregate_equal_any_, \
7978 SSE2NEON_CAT(SSE2NEON_SIZE_OF_, type), \
7979 SSE2NEON_CAT(x, SSE2NEON_CAT(SSE2NEON_NUMBER_OF_LANES_, \
7980 type))))(la, lb, mtx); \
7983#define SSE2NEON_CMP_RANGES_IMPL(type, data_type, us, byte_or_word) \
7984 static uint16_t _sse2neon_cmp_##us##type##_ranges(__m128i a, int la, \
7985 __m128i b, int lb) \
7989 a, b, mtx, data_type, us, SSE2NEON_CAT(SSE2NEON_SIZE_OF_, type), \
7990 SSE2NEON_CAT(SSE2NEON_NUMBER_OF_LANES_, type), byte_or_word); \
7991 return SSE2NEON_CAT( \
7992 _sse2neon_aggregate_ranges_, \
7994 SSE2NEON_CAT(SSE2NEON_SIZE_OF_, type), \
7995 SSE2NEON_CAT(x, SSE2NEON_CAT(SSE2NEON_NUMBER_OF_LANES_, \
7996 type))))(la, lb, mtx); \
7999#define SSE2NEON_CMP_EQUAL_ORDERED_IMPL(type) \
8000 static uint16_t _sse2neon_cmp_##type##_equal_ordered(__m128i a, int la, \
8001 __m128i b, int lb) \
8004 PCMPSTR_EQ(a, b, mtx, SSE2NEON_CAT(SSE2NEON_SIZE_OF_, type), \
8005 SSE2NEON_CAT(SSE2NEON_NUMBER_OF_LANES_, type)); \
8006 return SSE2NEON_CAT( \
8007 _sse2neon_aggregate_equal_ordered_, \
8009 SSE2NEON_CAT(SSE2NEON_SIZE_OF_, type), \
8011 SSE2NEON_CAT(SSE2NEON_NUMBER_OF_LANES_, type))))( \
8012 SSE2NEON_CAT(SSE2NEON_NUMBER_OF_LANES_, type), la, lb, mtx); \
8015 static uint16_t _sse2neon_aggregate_equal_any_8x16(int la, int lb, __m128i mtx[16])
8020 int m = (1 << la) - 1;
8021 uint8x8_t vec_mask = vld1_u8(_sse2neon_cmpestr_mask8b);
8022 uint8x8_t t_lo = vtst_u8(vdup_n_u8((uint8_t) (m & 0xff)), vec_mask);
8023 uint8x8_t t_hi = vtst_u8(vdup_n_u8((uint8_t) (m >> 8)), vec_mask);
8024 uint8x16_t vec = vcombine_u8(t_lo, t_hi);
8025 for (int j = 0; j < lb; j++) {
8037 static uint16_t _sse2neon_aggregate_equal_any_16x8(int la, int lb, __m128i mtx[16])
8042 uint16_t m = (uint16_t) (1 << la) - 1;
8044 vtstq_u16(vdupq_n_u16(m), vld1q_u16(_sse2neon_cmpestr_mask16b));
8045 for (int j = 0; j < lb; j++) {
8058#define SSE2NEON_GENERATE_CMP_EQUAL_ANY(prefix) \
8059 prefix##IMPL(byte) \
8065 static uint16_t _sse2neon_aggregate_ranges_16x8(int la, int lb, __m128i mtx[16])
8068 uint16_t m = (uint16_t) (1 << la) - 1;
8070 vtstq_u16(vdupq_n_u16(m), vld1q_u16(_sse2neon_cmpestr_mask16b));
8071 for (int j = 0; j < lb; j++) {
8080#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
8081 uint16_t t = vaddvq_u32(vec_res) ? 1 : 0;
8083 uint64x2_t sumh = vpaddlq_u32(vec_res);
8084 uint16_t t = vgetq_lane_u64(sumh, 0) + vgetq_lane_u64(sumh, 1);
8091 static uint16_t _sse2neon_aggregate_ranges_8x16(int la, int lb, __m128i mtx[16])
8094 uint16_t m = (uint16_t) ((1 << la) - 1);
8095 uint8x8_t vec_mask = vld1_u8(_sse2neon_cmpestr_mask8b);
8096 uint8x8_t t_lo = vtst_u8(vdup_n_u8((uint8_t) (m & 0xff)), vec_mask);
8097 uint8x8_t t_hi = vtst_u8(vdup_n_u8((uint8_t) (m >> 8)), vec_mask);
8098 uint8x16_t vec = vcombine_u8(t_lo, t_hi);
8099 for (int j = 0; j < lb; j++) {
8114#define SSE2NEON_CMP_RANGES_IS_BYTE 1
8115#define SSE2NEON_CMP_RANGES_IS_WORD 0
8118#define SSE2NEON_GENERATE_CMP_RANGES(prefix) \
8119 prefix##IMPL(byte, uint, u, prefix##IS_BYTE) \
8120 prefix##IMPL(byte, int, s, prefix##IS_BYTE) \
8121 prefix##IMPL(word, uint, u, prefix##IS_WORD) \
8122 prefix##IMPL(word, int, s, prefix##IS_WORD)
8127#undef SSE2NEON_CMP_RANGES_IS_BYTE
8128#undef SSE2NEON_CMP_RANGES_IS_WORD
8130 static uint16_t _sse2neon_cmp_byte_equal_each(__m128i a, int la, __m128i b, int lb)
8137 uint16_t m0 = (la < lb) ? 0 : (uint16_t) ((1 << la) - (1 << lb));
8138 uint16_t m1 = (uint16_t) (0x10000 - (1 << la));
8139 uint16_t tb = (uint16_t) (0x10000 - (1 << lb));
8140 uint8x8_t vec_mask, vec0_lo, vec0_hi, vec1_lo, vec1_hi;
8141 uint8x8_t tmp_lo, tmp_hi, res_lo, res_hi;
8142 vec_mask = vld1_u8(_sse2neon_cmpestr_mask8b);
8143 vec0_lo = vtst_u8(vdup_n_u8((uint8_t) m0), vec_mask);
8144 vec0_hi = vtst_u8(vdup_n_u8((uint8_t) (m0 >> 8)), vec_mask);
8145 vec1_lo = vtst_u8(vdup_n_u8((uint8_t) m1), vec_mask);
8146 vec1_hi = vtst_u8(vdup_n_u8((uint8_t) (m1 >> 8)), vec_mask);
8147 tmp_lo = vtst_u8(vdup_n_u8((uint8_t) tb), vec_mask);
8148 tmp_hi = vtst_u8(vdup_n_u8((uint8_t) (tb >> 8)), vec_mask);
8150 res_lo = vbsl_u8(vec0_lo, vdup_n_u8(0), vget_low_u8(mtx));
8151 res_hi = vbsl_u8(vec0_hi, vdup_n_u8(0), vget_high_u8(mtx));
8152 res_lo = vbsl_u8(vec1_lo, tmp_lo, res_lo);
8153 res_hi = vbsl_u8(vec1_hi, tmp_hi, res_hi);
8154 res_lo = vand_u8(res_lo, vec_mask);
8155 res_hi = vand_u8(res_hi, vec_mask);
8161 static uint16_t _sse2neon_cmp_word_equal_each(__m128i a, int la, __m128i b, int lb)
8168 uint16_t m0 = (uint16_t) ((la < lb) ? 0 : ((1 << la) - (1 << lb)));
8169 uint16_t m1 = (uint16_t) (0x100 - (1 << la));
8170 uint16_t tb = (uint16_t) (0x100 - (1 << lb));
8171 uint16x8_t vec_mask = vld1q_u16(_sse2neon_cmpestr_mask16b);
8172 uint16x8_t vec0 = vtstq_u16(vdupq_n_u16(m0), vec_mask);
8173 uint16x8_t vec1 = vtstq_u16(vdupq_n_u16(m1), vec_mask);
8174 uint16x8_t tmp = vtstq_u16(vdupq_n_u16(tb), vec_mask);
8175 mtx = vbslq_u16(vec0, vdupq_n_u16(0), mtx);
8176 mtx = vbslq_u16(vec1, tmp, mtx);
8177 mtx = vandq_u16(mtx, vec_mask);
8181#define SSE2NEON_AGGREGATE_EQUAL_ORDER_IS_UBYTE 1
8182#define SSE2NEON_AGGREGATE_EQUAL_ORDER_IS_UWORD 0
8184#define SSE2NEON_AGGREGATE_EQUAL_ORDER_IMPL(size, number_of_lanes, data_type) \
8186 _sse2neon_aggregate_equal_ordered_##size##x##number_of_lanes( \
8187 int bound, int la, int lb, __m128i mtx[16]) \
8191 (uint16_t) (SSE2NEON_IIF(data_type)(0x10000, 0x100) - (1 << la)); \
8192 uint##size##x8_t vec_mask = SSE2NEON_IIF(data_type)( \
8193 vld1_u##size(_sse2neon_cmpestr_mask##size##b), \
8194 vld1q_u##size(_sse2neon_cmpestr_mask##size##b)); \
8195 uint##size##x##number_of_lanes##_t vec1 = SSE2NEON_IIF(data_type)( \
8197 vtst_u##size(vdup_n_u##size((uint##size##_t) m1), vec_mask), \
8198 vtst_u##size(vdup_n_u##size((uint##size##_t)(m1 >> 8)), \
8200 vtstq_u##size(vdupq_n_u##size((uint##size##_t) m1), vec_mask)); \
8201 uint##size##x##number_of_lanes##_t vec_minusone = vdupq_n_u##size(-1); \
8202 uint##size##x##number_of_lanes##_t vec_zero = vdupq_n_u##size(0); \
8203 for (int j = 0; j < lb; j++) { \
8204 mtx[j] = vreinterpretq_m128i_u##size(vbslq_u##size( \
8205 vec1, vec_minusone, vreinterpretq_u##size##_m128i(mtx[j]))); \
8207 for (int j = lb; j < bound; j++) { \
8208 mtx[j] = vreinterpretq_m128i_u##size( \
8209 vbslq_u##size(vec1, vec_minusone, vec_zero)); \
8211 unsigned SSE2NEON_IIF(data_type)(char, short) *ptr = \
8212 (unsigned SSE2NEON_IIF(data_type)(char, short) *) mtx; \
8213 for (int i = 0; i < bound; i++) { \
8215 for (int j = 0, k = i; j < bound - i && k < bound; j++, k++) \
8216 val &= ptr[k * bound + j]; \
8217 res += (uint16_t) (val << i); \
8223#define SSE2NEON_GENERATE_AGGREGATE_EQUAL_ORDER(prefix) \
8224 prefix##IMPL(8, 16, prefix##IS_UBYTE) \
8225 prefix##IMPL(16, 8, prefix##IS_UWORD)
8230#undef SSE2NEON_AGGREGATE_EQUAL_ORDER_IS_UBYTE
8231#undef SSE2NEON_AGGREGATE_EQUAL_ORDER_IS_UWORD
8234#define SSE2NEON_GENERATE_CMP_EQUAL_ORDERED(prefix) \
8235 prefix##IMPL(byte) \
8241#define SSE2NEON_CMPESTR_LIST \
8242 _(CMP_UBYTE_EQUAL_ANY, cmp_byte_equal_any) \
8243 _(CMP_UWORD_EQUAL_ANY, cmp_word_equal_any) \
8244 _(CMP_SBYTE_EQUAL_ANY, cmp_byte_equal_any) \
8245 _(CMP_SWORD_EQUAL_ANY, cmp_word_equal_any) \
8246 _(CMP_UBYTE_RANGES, cmp_ubyte_ranges) \
8247 _(CMP_UWORD_RANGES, cmp_uword_ranges) \
8248 _(CMP_SBYTE_RANGES, cmp_sbyte_ranges) \
8249 _(CMP_SWORD_RANGES, cmp_sword_ranges) \
8250 _(CMP_UBYTE_EQUAL_EACH, cmp_byte_equal_each) \
8251 _(CMP_UWORD_EQUAL_EACH, cmp_word_equal_each) \
8252 _(CMP_SBYTE_EQUAL_EACH, cmp_byte_equal_each) \
8253 _(CMP_SWORD_EQUAL_EACH, cmp_word_equal_each) \
8254 _(CMP_UBYTE_EQUAL_ORDERED, cmp_byte_equal_ordered) \
8255 _(CMP_UWORD_EQUAL_ORDERED, cmp_word_equal_ordered) \
8256 _(CMP_SBYTE_EQUAL_ORDERED, cmp_byte_equal_ordered) \
8257 _(CMP_SWORD_EQUAL_ORDERED, cmp_word_equal_ordered)
8260#define _(name, func_suffix) name,
8266#define _(name, func_suffix) _sse2neon_##func_suffix,
8276 switch (imm8 & 0x30) {
8281 res ^= (1 << lb) - 1;
8287 return (uint16_t) (res & ((bound == 8) ? 0xFF : 0xFFFF));
8292#if defined(_MSC_VER) && !defined(__clang__)
8293 unsigned long cnt = 0;
8294 if (_BitScanReverse(&cnt, x))
8298 return x != 0 ? __builtin_clz(x) : 32;
8304#if defined(_MSC_VER) && !defined(__clang__)
8305 unsigned long cnt = 0;
8306 if (_BitScanForward(&cnt, x))
8310 return x != 0 ? __builtin_ctz(x) : 32;
8318#if defined(SSE2NEON_HAS_BITSCAN64)
8319 if (_BitScanForward64(&cnt, x))
8322 if (_BitScanForward(&cnt, (unsigned long) (x)))
8324 if (_BitScanForward(&cnt, (unsigned long) (x >> 32)))
8325 return (int) (cnt + 32);
8329 return x != 0 ? __builtin_ctzll(x) : 64;
8333#define SSE2NEON_MIN(x, y) (x) < (y) ? (x) : (y)
8335#define SSE2NEON_CMPSTR_SET_UPPER(var, imm) \
8336 const int var = ((imm) & 0x01) ? 8 : 16
8338#define SSE2NEON_CMPESTRX_LEN_PAIR(a, b, la, lb) \
8339 int tmp1 = la ^ (la >> 31); \
8340 la = tmp1 - (la >> 31); \
8341 int tmp2 = lb ^ (lb >> 31); \
8342 lb = tmp2 - (lb >> 31); \
8343 la = SSE2NEON_MIN(la, bound); \
8344 lb = SSE2NEON_MIN(lb, bound)
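// The xor/subtract pairs above compute |la| and |lb| without branches before
// clamping both lengths to the lane count implied by the element size.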
8351#define SSE2NEON_COMP_AGG(a, b, la, lb, imm8, IE) \
8352 SSE2NEON_CMPSTR_SET_UPPER(bound, imm8); \
8353 SSE2NEON_##IE##_LEN_PAIR(a, b, la, lb); \
8354 uint16_t r2 = (_sse2neon_cmpfunc_table[(imm8) & 0x0f])(a, la, b, lb); \
8355 r2 = _sse2neon_sido_negative(r2, lb, imm8, bound)
8357#define SSE2NEON_CMPSTR_GENERATE_INDEX(r2, bound, imm8) \
8358 return (r2 == 0) ? bound \
8359 : (((imm8) & 0x40) ? (31 - _sse2neon_clz(r2)) \
8360 : _sse2neon_ctz(r2))
8362#define SSE2NEON_CMPSTR_GENERATE_MASK(dst) \
8363 __m128i dst = vreinterpretq_m128i_u8(vdupq_n_u8(0)); \
8364 if ((imm8) & 0x40) { \
8366 uint16x8_t tmp = vtstq_u16(vdupq_n_u16(r2), \
8367 vld1q_u16(_sse2neon_cmpestr_mask16b)); \
8368 dst = vreinterpretq_m128i_u16(vbslq_u16( \
8369 tmp, vdupq_n_u16(-1), vreinterpretq_u16_m128i(dst))); \
8371 uint8x16_t vec_r2 = vcombine_u8(vdup_n_u8((uint8_t) r2), \
8372 vdup_n_u8((uint8_t) (r2 >> 8))); \
8374 vtstq_u8(vec_r2, vld1q_u8(_sse2neon_cmpestr_mask8b)); \
8375 dst = vreinterpretq_m128i_u8( \
8376 vbslq_u8(tmp, vdupq_n_u8(-1), vreinterpretq_u8_m128i(dst))); \
8379 if (bound == 16) { \
8380 dst = vreinterpretq_m128i_u16( \
8381 vsetq_lane_u16(r2 & 0xffff, vreinterpretq_u16_m128i(dst), 0)); \
8383 dst = vreinterpretq_m128i_u8(vsetq_lane_u8( \
8384 (uint8_t) (r2 & 0xff), vreinterpretq_u8_m128i(dst), 0)); \
8401 return !r2 & (lb_cpy > bound);
8466 return la <= (bound - 1);
8482 return lb <= (bound - 1);
8485#define SSE2NEON_CMPISTRX_LENGTH(str, len, imm8) \
8487 if ((imm8) & 0x01) { \
8488 uint16x8_t equal_mask_##str = \
8489 vceqq_u16(vreinterpretq_u16_m128i(str), vdupq_n_u16(0)); \
8490 uint8x8_t res_##str = vshrn_n_u16(equal_mask_##str, 4); \
8491 uint64_t matches_##str = \
8492 vget_lane_u64(vreinterpret_u64_u8(res_##str), 0); \
8493 len = _sse2neon_ctzll(matches_##str) >> 3; \
8495 uint16x8_t equal_mask_##str = vreinterpretq_u16_u8( \
8496 vceqq_u8(vreinterpretq_u8_m128i(str), vdupq_n_u8(0))); \
8497 uint8x8_t res_##str = vshrn_n_u16(equal_mask_##str, 4); \
8498 uint64_t matches_##str = \
8499 vget_lane_u64(vreinterpret_u64_u8(res_##str), 0); \
8500 len = _sse2neon_ctzll(matches_##str) >> 2; \
8504#define SSE2NEON_CMPISTRX_LEN_PAIR(a, b, la, lb) \
8507 SSE2NEON_CMPISTRX_LENGTH(a, la, imm8); \
8508 SSE2NEON_CMPISTRX_LENGTH(b, lb, imm8); \
8518 return !r2 & (lb >= bound);
8566 return la <= (bound - 1);
8578 return lb <= (bound - 1);
8585#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
8600#if defined(__aarch64__) && defined(__ARM_FEATURE_CRC32)
8601 __asm__ __volatile__("crc32ch %w[c], %w[c], %w[v]\n\t"
8604#elif ((__ARM_ARCH == 8) && defined(__ARM_FEATURE_CRC32)) || \
8605 ((defined(_M_ARM64) || defined(_M_ARM64EC)) && !defined(__clang__))
8606 crc = __crc32ch(crc, v);
8619#if defined(__aarch64__) && defined(__ARM_FEATURE_CRC32)
8620 __asm__ __volatile__("crc32cw %w[c], %w[c], %w[v]\n\t"
8623#elif ((__ARM_ARCH == 8) && defined(__ARM_FEATURE_CRC32)) || \
8624 ((defined(_M_ARM64) || defined(_M_ARM64EC)) && !defined(__clang__))
8625 crc = __crc32cw(crc, v);
8638#if defined(__aarch64__) && defined(__ARM_FEATURE_CRC32)
8639 __asm__ __volatile__("crc32cx %w[c], %w[c], %x[v]\n\t"
8642#elif ((defined(_M_ARM64) || defined(_M_ARM64EC)) && !defined(__clang__))
8643 crc = __crc32cd((uint32_t) crc, v);
8645 crc = _mm_crc32_u32((uint32_t) (crc), (uint32_t) (v & 0xffffffff));
8646 crc = _mm_crc32_u32((uint32_t) (crc), (uint32_t) ((v >> 32) & 0xffffffff));
8656#if defined(__aarch64__) && defined(__ARM_FEATURE_CRC32)
8657 __asm__ __volatile__("crc32cb %w[c], %w[c], %w[v]\n\t"
8660#elif ((__ARM_ARCH == 8) && defined(__ARM_FEATURE_CRC32)) || \
8661 ((defined(_M_ARM64) || defined(_M_ARM64EC)) && !defined(__clang__))
8662 crc = __crc32cb(crc, v);
8665#if defined(__ARM_FEATURE_CRYPTO)
8669 vcombine_u64(vcreate_u64((uint64_t) (crc) << 24), vcreate_u64(0x0));
8670 uint64x2_t tmp = orig;
8673 uint64_t p = 0x105EC76F1;
8676 uint64_t mu = 0x1dea713f1;
8679 tmp = _sse2neon_vmull_p64(vget_low_u64(tmp), vcreate_u64(mu));
8682 vandq_u64(tmp, vcombine_u64(vcreate_u64(0xFFFFFFFF), vcreate_u64(0x0)));
8684 tmp = _sse2neon_vmull_p64(vget_low_u64(tmp), vcreate_u64(p));
8686 tmp = veorq_u64(tmp, orig);
8689 crc = vgetq_lane_u32(vreinterpretq_u32_u64(tmp), 1);
8697 static const uint32_t crc32_half_byte_tbl[] = {
8698 0x00000000, 0x105ec76f, 0x20bd8ede, 0x30e349b1, 0x417b1dbc, 0x5125dad3,
8699 0x61c69362, 0x7198540d, 0x82f63b78, 0x92a8fc17, 0xa24bb5a6, 0xb21572c9,
8700 0xc38d26c4, 0xd3d3e1ab, 0xe330a81a, 0xf36e6f75,
8703 crc = (crc >> 4) ^ crc32_half_byte_tbl[crc & 0x0F];
8704 crc = (crc >> 4) ^ crc32_half_byte_tbl[crc & 0x0F];
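// The software CRC-32C fallback processes each byte as two 4-bit halves, so a
// 16-entry nibble table replaces the conventional 256-entry byte table.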
8712#if !defined(__ARM_FEATURE_CRYPTO) && \
8713 ((!defined(_M_ARM64) && !defined(_M_ARM64EC)) || defined(__clang__))
8715#define SSE2NEON_AES_SBOX(w) \
8717 w(0x63), w(0x7c), w(0x77), w(0x7b), w(0xf2), w(0x6b), w(0x6f), \
8718 w(0xc5), w(0x30), w(0x01), w(0x67), w(0x2b), w(0xfe), w(0xd7), \
8719 w(0xab), w(0x76), w(0xca), w(0x82), w(0xc9), w(0x7d), w(0xfa), \
8720 w(0x59), w(0x47), w(0xf0), w(0xad), w(0xd4), w(0xa2), w(0xaf), \
8721 w(0x9c), w(0xa4), w(0x72), w(0xc0), w(0xb7), w(0xfd), w(0x93), \
8722 w(0x26), w(0x36), w(0x3f), w(0xf7), w(0xcc), w(0x34), w(0xa5), \
8723 w(0xe5), w(0xf1), w(0x71), w(0xd8), w(0x31), w(0x15), w(0x04), \
8724 w(0xc7), w(0x23), w(0xc3), w(0x18), w(0x96), w(0x05), w(0x9a), \
8725 w(0x07), w(0x12), w(0x80), w(0xe2), w(0xeb), w(0x27), w(0xb2), \
8726 w(0x75), w(0x09), w(0x83), w(0x2c), w(0x1a), w(0x1b), w(0x6e), \
8727 w(0x5a), w(0xa0), w(0x52), w(0x3b), w(0xd6), w(0xb3), w(0x29), \
8728 w(0xe3), w(0x2f), w(0x84), w(0x53), w(0xd1), w(0x00), w(0xed), \
8729 w(0x20), w(0xfc), w(0xb1), w(0x5b), w(0x6a), w(0xcb), w(0xbe), \
8730 w(0x39), w(0x4a), w(0x4c), w(0x58), w(0xcf), w(0xd0), w(0xef), \
8731 w(0xaa), w(0xfb), w(0x43), w(0x4d), w(0x33), w(0x85), w(0x45), \
8732 w(0xf9), w(0x02), w(0x7f), w(0x50), w(0x3c), w(0x9f), w(0xa8), \
8733 w(0x51), w(0xa3), w(0x40), w(0x8f), w(0x92), w(0x9d), w(0x38), \
8734 w(0xf5), w(0xbc), w(0xb6), w(0xda), w(0x21), w(0x10), w(0xff), \
8735 w(0xf3), w(0xd2), w(0xcd), w(0x0c), w(0x13), w(0xec), w(0x5f), \
8736 w(0x97), w(0x44), w(0x17), w(0xc4), w(0xa7), w(0x7e), w(0x3d), \
8737 w(0x64), w(0x5d), w(0x19), w(0x73), w(0x60), w(0x81), w(0x4f), \
8738 w(0xdc), w(0x22), w(0x2a), w(0x90), w(0x88), w(0x46), w(0xee), \
8739 w(0xb8), w(0x14), w(0xde), w(0x5e), w(0x0b), w(0xdb), w(0xe0), \
8740 w(0x32), w(0x3a), w(0x0a), w(0x49), w(0x06), w(0x24), w(0x5c), \
8741 w(0xc2), w(0xd3), w(0xac), w(0x62), w(0x91), w(0x95), w(0xe4), \
8742 w(0x79), w(0xe7), w(0xc8), w(0x37), w(0x6d), w(0x8d), w(0xd5), \
8743 w(0x4e), w(0xa9), w(0x6c), w(0x56), w(0xf4), w(0xea), w(0x65), \
8744 w(0x7a), w(0xae), w(0x08), w(0xba), w(0x78), w(0x25), w(0x2e), \
8745 w(0x1c), w(0xa6), w(0xb4), w(0xc6), w(0xe8), w(0xdd), w(0x74), \
8746 w(0x1f), w(0x4b), w(0xbd), w(0x8b), w(0x8a), w(0x70), w(0x3e), \
8747 w(0xb5), w(0x66), w(0x48), w(0x03), w(0xf6), w(0x0e), w(0x61), \
8748 w(0x35), w(0x57), w(0xb9), w(0x86), w(0xc1), w(0x1d), w(0x9e), \
8749 w(0xe1), w(0xf8), w(0x98), w(0x11), w(0x69), w(0xd9), w(0x8e), \
8750 w(0x94), w(0x9b), w(0x1e), w(0x87), w(0xe9), w(0xce), w(0x55), \
8751 w(0x28), w(0xdf), w(0x8c), w(0xa1), w(0x89), w(0x0d), w(0xbf), \
8752 w(0xe6), w(0x42), w(0x68), w(0x41), w(0x99), w(0x2d), w(0x0f), \
8753 w(0xb0), w(0x54), w(0xbb), w(0x16) \
8755#define SSE2NEON_AES_RSBOX(w) \
8757 w(0x52), w(0x09), w(0x6a), w(0xd5), w(0x30), w(0x36), w(0xa5), \
8758 w(0x38), w(0xbf), w(0x40), w(0xa3), w(0x9e), w(0x81), w(0xf3), \
8759 w(0xd7), w(0xfb), w(0x7c), w(0xe3), w(0x39), w(0x82), w(0x9b), \
8760 w(0x2f), w(0xff), w(0x87), w(0x34), w(0x8e), w(0x43), w(0x44), \
8761 w(0xc4), w(0xde), w(0xe9), w(0xcb), w(0x54), w(0x7b), w(0x94), \
8762 w(0x32), w(0xa6), w(0xc2), w(0x23), w(0x3d), w(0xee), w(0x4c), \
8763 w(0x95), w(0x0b), w(0x42), w(0xfa), w(0xc3), w(0x4e), w(0x08), \
8764 w(0x2e), w(0xa1), w(0x66), w(0x28), w(0xd9), w(0x24), w(0xb2), \
8765 w(0x76), w(0x5b), w(0xa2), w(0x49), w(0x6d), w(0x8b), w(0xd1), \
8766 w(0x25), w(0x72), w(0xf8), w(0xf6), w(0x64), w(0x86), w(0x68), \
8767 w(0x98), w(0x16), w(0xd4), w(0xa4), w(0x5c), w(0xcc), w(0x5d), \
8768 w(0x65), w(0xb6), w(0x92), w(0x6c), w(0x70), w(0x48), w(0x50), \
8769 w(0xfd), w(0xed), w(0xb9), w(0xda), w(0x5e), w(0x15), w(0x46), \
8770 w(0x57), w(0xa7), w(0x8d), w(0x9d), w(0x84), w(0x90), w(0xd8), \
8771 w(0xab), w(0x00), w(0x8c), w(0xbc), w(0xd3), w(0x0a), w(0xf7), \
8772 w(0xe4), w(0x58), w(0x05), w(0xb8), w(0xb3), w(0x45), w(0x06), \
8773 w(0xd0), w(0x2c), w(0x1e), w(0x8f), w(0xca), w(0x3f), w(0x0f), \
8774 w(0x02), w(0xc1), w(0xaf), w(0xbd), w(0x03), w(0x01), w(0x13), \
8775 w(0x8a), w(0x6b), w(0x3a), w(0x91), w(0x11), w(0x41), w(0x4f), \
8776 w(0x67), w(0xdc), w(0xea), w(0x97), w(0xf2), w(0xcf), w(0xce), \
8777 w(0xf0), w(0xb4), w(0xe6), w(0x73), w(0x96), w(0xac), w(0x74), \
8778 w(0x22), w(0xe7), w(0xad), w(0x35), w(0x85), w(0xe2), w(0xf9), \
8779 w(0x37), w(0xe8), w(0x1c), w(0x75), w(0xdf), w(0x6e), w(0x47), \
8780 w(0xf1), w(0x1a), w(0x71), w(0x1d), w(0x29), w(0xc5), w(0x89), \
8781 w(0x6f), w(0xb7), w(0x62), w(0x0e), w(0xaa), w(0x18), w(0xbe), \
8782 w(0x1b), w(0xfc), w(0x56), w(0x3e), w(0x4b), w(0xc6), w(0xd2), \
8783 w(0x79), w(0x20), w(0x9a), w(0xdb), w(0xc0), w(0xfe), w(0x78), \
8784 w(0xcd), w(0x5a), w(0xf4), w(0x1f), w(0xdd), w(0xa8), w(0x33), \
8785 w(0x88), w(0x07), w(0xc7), w(0x31), w(0xb1), w(0x12), w(0x10), \
8786 w(0x59), w(0x27), w(0x80), w(0xec), w(0x5f), w(0x60), w(0x51), \
8787 w(0x7f), w(0xa9), w(0x19), w(0xb5), w(0x4a), w(0x0d), w(0x2d), \
8788 w(0xe5), w(0x7a), w(0x9f), w(0x93), w(0xc9), w(0x9c), w(0xef), \
8789 w(0xa0), w(0xe0), w(0x3b), w(0x4d), w(0xae), w(0x2a), w(0xf5), \
8790 w(0xb0), w(0xc8), w(0xeb), w(0xbb), w(0x3c), w(0x83), w(0x53), \
8791 w(0x99), w(0x61), w(0x17), w(0x2b), w(0x04), w(0x7e), w(0xba), \
8792 w(0x77), w(0xd6), w(0x26), w(0xe1), w(0x69), w(0x14), w(0x63), \
8793 w(0x55), w(0x21), w(0x0c), w(0x7d) \
8798#define SSE2NEON_AES_H0(x) (x)
8801#undef SSE2NEON_AES_H0
8804#if !defined(__aarch64__)
8805#define SSE2NEON_XT(x) (((x) << 1) ^ ((((x) >> 7) & 1) * 0x1b))
8806#define SSE2NEON_MULTIPLY(x, y) \
8807 (((y & 1) * x) ^ ((y >> 1 & 1) * SSE2NEON_XT(x)) ^ \
8808 ((y >> 2 & 1) * SSE2NEON_XT(SSE2NEON_XT(x))) ^ \
8809 ((y >> 3 & 1) * SSE2NEON_XT(SSE2NEON_XT(SSE2NEON_XT(x)))) ^ \
8810 ((y >> 4 & 1) * SSE2NEON_XT(SSE2NEON_XT(SSE2NEON_XT(SSE2NEON_XT(x))))))
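// SSE2NEON_XT is the AES xtime step (multiplication by x in GF(2^8) reduced by
// x^8 + x^4 + x^3 + x + 1); SSE2NEON_MULTIPLY combines shifted copies of x,
// one per set bit of y, to form a general GF(2^8) product.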
8820#if defined(__aarch64__)
8821 static const uint8_t shift_rows[] = {
8822 0x0, 0x5, 0xa, 0xf, 0x4, 0x9, 0xe, 0x3,
8823 0x8, 0xd, 0x2, 0x7, 0xc, 0x1, 0x6, 0xb,
8825 static const uint8_t ror32by8[] = {
8826 0x1, 0x2, 0x3, 0x0, 0x5, 0x6, 0x7, 0x4,
8827 0x9, 0xa, 0xb, 0x8, 0xd, 0xe, 0xf, 0xc,
8834 w = vqtbl1q_u8(w, vld1q_u8(shift_rows));
8849 w = (v << 1) ^ (uint8x16_t) (((int8x16_t) v >> 7) & 0x1b);
8850 w ^= (uint8x16_t) vrev32q_u16((uint16x8_t) v);
8851 w ^= vqtbl1q_u8(v ^ w, vld1q_u8(ror32by8));
8857#define SSE2NEON_AES_B2W(b0, b1, b2, b3) \
8858 (((uint32_t) (b3) << 24) | ((uint32_t) (b2) << 16) | \
8859 ((uint32_t) (b1) << 8) | (uint32_t) (b0))
8861#define SSE2NEON_AES_F2(x) ((x << 1) ^ (((x >> 7) & 1) * 0x011b ))
8863#define SSE2NEON_AES_F3(x) (SSE2NEON_AES_F2(x) ^ x)
8864#define SSE2NEON_AES_U0(p) \
8865 SSE2NEON_AES_B2W(SSE2NEON_AES_F2(p), p, p, SSE2NEON_AES_F3(p))
8866#define SSE2NEON_AES_U1(p) \
8867 SSE2NEON_AES_B2W(SSE2NEON_AES_F3(p), SSE2NEON_AES_F2(p), p, p)
8868#define SSE2NEON_AES_U2(p) \
8869 SSE2NEON_AES_B2W(p, SSE2NEON_AES_F3(p), SSE2NEON_AES_F2(p), p)
8870#define SSE2NEON_AES_U3(p) \
8871 SSE2NEON_AES_B2W(p, p, SSE2NEON_AES_F3(p), SSE2NEON_AES_F2(p))
8875 static const uint32_t ALIGN_STRUCT(16) aes_table[4][256] = {
8881#undef SSE2NEON_AES_B2W
8882#undef SSE2NEON_AES_F2
8883#undef SSE2NEON_AES_F3
8884#undef SSE2NEON_AES_U0
8885#undef SSE2NEON_AES_U1
8886#undef SSE2NEON_AES_U2
8887#undef SSE2NEON_AES_U3
8899 (aes_table[0][x3 & 0xff] ^ aes_table[1][(x0 >> 8) & 0xff] ^
8900 aes_table[2][(x1 >> 16) & 0xff] ^ aes_table[3][x2 >> 24]),
8901 (aes_table[0][x2 & 0xff] ^ aes_table[1][(x3 >> 8) & 0xff] ^
8902 aes_table[2][(x0 >> 16) & 0xff] ^ aes_table[3][x1 >> 24]),
8903 (aes_table[0][x1 & 0xff] ^ aes_table[1][(x2 >> 8) & 0xff] ^
8904 aes_table[2][(x3 >> 16) & 0xff] ^ aes_table[3][x0 >> 24]),
8905 (aes_table[0][x0 & 0xff] ^ aes_table[1][(x1 >> 8) & 0xff] ^
8906 aes_table[2][(x2 >> 16) & 0xff] ^ aes_table[3][x3 >> 24]));
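// On targets without the crypto extension the AES round is computed with four
// 256-entry T-tables generated from the S-box by SSE2NEON_AES_U0..U3: each
// output word above fuses SubBytes, ShiftRows and MixColumns into four lookups.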
8917#if defined(__aarch64__)
8918 static const uint8_t inv_shift_rows[] = {
8919 0x0, 0xd, 0xa, 0x7, 0x4, 0x1, 0xe, 0xb,
8920 0x8, 0x5, 0x2, 0xf, 0xc, 0x9, 0x6, 0x3,
8922 static const uint8_t ror32by8[] = {
8923 0x1, 0x2, 0x3, 0x0, 0x5, 0x6, 0x7, 0x4,
8924 0x9, 0xa, 0xb, 0x8, 0xd, 0xe, 0xf, 0xc,
8931 w = vqtbl1q_u8(w, vld1q_u8(inv_shift_rows));
8941 w = (v << 1) ^ (uint8x16_t) (((int8x16_t) v >> 7) & 0x1b);
8942 w = (w << 1) ^ (uint8x16_t) (((int8x16_t) w >> 7) & 0x1b);
8944 v ^= (uint8x16_t) vrev32q_u16((uint16x8_t) w);
8946 w = (v << 1) ^ (uint8x16_t) (((int8x16_t) v >> 7) & 0x1b);
8948 w ^= (uint8x16_t) vrev32q_u16((uint16x8_t) v);
8949 w ^= vqtbl1q_u8(v ^ w, vld1q_u8(ror32by8));
8956 uint8_t i, e, f, g, h, v[4][4];
8957 uint8_t *_a = (uint8_t *) &a;
8958 for (i = 0; i < 16; ++i) {
8959 v[((i / 4) + (i % 4)) % 4][i % 4] = _sse2neon_rsbox[_a[i]];
8963 for (i = 0; i < 4; ++i) {
8989#if defined(__aarch64__)
8990 static const uint8_t shift_rows[] = {
8991 0x0, 0x5, 0xa, 0xf, 0x4, 0x9, 0xe, 0x3,
8992 0x8, 0xd, 0x2, 0x7, 0xc, 0x1, 0x6, 0xb,
8999 w = vqtbl1q_u8(w, vld1q_u8(shift_rows));
9039#if defined(__aarch64__)
9040 static const uint8_t inv_shift_rows[] = {
9041 0x0, 0xd, 0xa, 0x7, 0x4, 0x1, 0xe, 0xb,
9042 0x8, 0x5, 0x2, 0xf, 0xc, 0x9, 0x6, 0x3,
9049 w = vqtbl1q_u8(w, vld1q_u8(inv_shift_rows));
9063 uint8_t *_a = (uint8_t *) &a;
9064 for (int i = 0; i < 16; ++i) {
9065 v[((i / 4) + (i % 4)) % 4][i % 4] = _sse2neon_rsbox[_a[i]];
9077#if defined(__aarch64__)
9078 static const uint8_t ror32by8[] = {
9079 0x1, 0x2, 0x3, 0x0, 0x5, 0x6, 0x7, 0x4,
9080 0x9, 0xa, 0xb, 0x8, 0xd, 0xe, 0xf, 0xc,
9086 w = (v << 1) ^ (uint8x16_t) (((int8x16_t) v >> 7) & 0x1b);
9087 w = (w << 1) ^ (uint8x16_t) (((int8x16_t) w >> 7) & 0x1b);
9089 v ^= (uint8x16_t) vrev32q_u16((uint16x8_t) w);
9092 w = (v << 1) ^ (uint8x16_t) (((int8x16_t) v >> 7) & 0x1b);
9093 w ^= (uint8x16_t) vrev32q_u16((uint16x8_t) v);
9094 w ^= vqtbl1q_u8(v ^ w, vld1q_u8(ror32by8));
9098 uint8_t i, e, f, g, h, v[4][4];
9100 for (i = 0; i < 4; ++i) {
9131#if defined(__aarch64__)
9138 uint32x4_t v_u32 = vreinterpretq_u32_u8(v);
9139 uint32x4_t ror_v = vorrq_u32(vshrq_n_u32(v_u32, 8), vshlq_n_u32(v_u32, 24));
9140 uint32x4_t ror_xor_v = veorq_u32(ror_v, vdupq_n_u32(rcon));
9147 for (int i = 0; i < 4; ++i) {
9148 ((uint8_t *) &X1)[i] = _sse2neon_sbox[((uint8_t *) &X1)[i]];
9149 ((uint8_t *) &X3)[i] = _sse2neon_sbox[((uint8_t *) &X3)[i]];
9152 ((X1 >> 8) | (X1 << 24)) ^ rcon, X1);
9155#undef SSE2NEON_AES_SBOX
9156#undef SSE2NEON_AES_RSBOX
9158#if defined(__aarch64__)
9160#undef SSE2NEON_MULTIPLY
9223#if !defined(_MSC_VER) || defined(__clang__)
9226 u8[0x4], u8[0x1], u8[0xE], u8[0xB],
9227 u8[0x1], u8[0xE], u8[0xB], u8[0x4],
9228 u8[0xC], u8[0x9], u8[0x6], u8[0x3],
9229 u8[0x9], u8[0x6], u8[0x3], u8[0xC],
9231 uint32x4_t r = {0, (unsigned) rcon, 0, (unsigned) rcon};
9240 ((uint64_t) u8.n128_u8[0x4] << 0) | ((uint64_t) u8.n128_u8[0x1] << 8) |
9241 ((uint64_t) u8.n128_u8[0xE] << 16) |
9242 ((uint64_t) u8.n128_u8[0xB] << 24) |
9243 ((uint64_t) u8.n128_u8[0x1] << 32) |
9244 ((uint64_t) u8.n128_u8[0xE] << 40) |
9245 ((uint64_t) u8.n128_u8[0xB] << 48) |
9246 ((uint64_t) u8.n128_u8[0x4] << 56),
9247 ((uint64_t) u8.n128_u8[0xC] << 0) | ((uint64_t) u8.n128_u8[0x9] << 8) |
9248 ((uint64_t) u8.n128_u8[0x6] << 16) |
9249 ((uint64_t) u8.n128_u8[0x3] << 24) |
9250 ((uint64_t) u8.n128_u8[0x9] << 32) |
9251 ((uint64_t) u8.n128_u8[0x6] << 40) |
9252 ((uint64_t) u8.n128_u8[0x3] << 48) |
9253 ((uint64_t) u8.n128_u8[0xC] << 56)};
9255 dest.n128_u32[1] = dest.n128_u32[1] ^ rcon;
9256 dest.n128_u32[3] = dest.n128_u32[3] ^ rcon;
9272 switch (imm & 0x11) {
9275 _sse2neon_vmull_p64(vget_low_u64(a), vget_low_u64(b)));
9278 _sse2neon_vmull_p64(vget_high_u64(a), vget_low_u64(b)));
9281 _sse2neon_vmull_p64(vget_low_u64(a), vget_high_u64(b)));
9284 _sse2neon_vmull_p64(vget_high_u64(a), vget_high_u64(b)));
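// For _mm_clmulepi64_si128, bit 0 of the immediate selects the low or high
// 64-bit half of a and bit 4 selects the half of b before the carry-less
// multiply, which is why the switch above tests imm & 0x11.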
9294#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
9301#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
9304 __asm__ __volatile__("vmrs %0, FPSCR" : "=r"(r.value));
9315#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
9316#if __has_builtin(__builtin_popcount)
9317 return __builtin_popcount(a);
9318#elif defined(_MSC_VER)
9319 return _CountOneBits(a);
9321 return (int) vaddlv_u8(vcnt_u8(vcreate_u8((uint64_t) a)));
9325 uint8x8_t input_val, count8x8_val;
9326 uint16x4_t count16x4_val;
9327 uint32x2_t count32x2_val;
9329 input_val = vld1_u8((uint8_t *) &a);
9330 count8x8_val = vcnt_u8(input_val);
9331 count16x4_val = vpaddl_u8(count8x8_val);
9332 count32x2_val = vpaddl_u16(count16x4_val);
9334 vst1_u32(&count, count32x2_val);
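// The ARMv7 popcount fallback counts set bits per byte with vcnt_u8 and then
// folds them with pairwise widening additions until one 32-bit total remains.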
9344#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
9345#if __has_builtin(__builtin_popcountll)
9346 return __builtin_popcountll(a);
9347#elif defined(_MSC_VER)
9348 return _CountOneBits64(a);
9350 return (int64_t) vaddlv_u8(vcnt_u8(vcreate_u8(a)));
9354 uint8x8_t input_val, count8x8_val;
9355 uint16x4_t count16x4_val;
9356 uint32x2_t count32x2_val;
9357 uint64x1_t count64x1_val;
9359 input_val = vld1_u8((uint8_t *) &a);
9360 count8x8_val = vcnt_u8(input_val);
9361 count16x4_val = vpaddl_u8(count8x8_val);
9362 count32x2_val = vpaddl_u16(count16x4_val);
9363 count64x1_val = vpaddl_u32(count32x2_val);
9364 vst1_u64(&count, count64x1_val);
9375#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
9382#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
9385 __asm__ __volatile__("vmrs %0, FPSCR" : "=r"(r.value));
9390#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
9393 __asm__ __volatile__("vmsr FPSCR, %0" :: "r"(r));
9401#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
9410#if defined(_MSC_VER) && !defined(__clang__)
9411 val = _ReadStatusReg(ARM64_SYSREG(3, 3, 14, 0, 2));
9413 __asm__ __volatile__("mrs %0, cntvct_el0" : "=r"(val));
9418 uint32_t pmccntr, pmuseren, pmcntenset;
9421 __asm__ __volatile__("mrc p15, 0, %0, c9, c14, 0" : "=r"(pmuseren));
9423 __asm__ __volatile__("mrc p15, 0, %0, c9, c12, 1" : "=r"(pmcntenset));
9424 if (pmcntenset & 0x80000000UL) {
9425 __asm__ __volatile__("mrc p15, 0, %0, c9, c13, 0" : "=r"(pmccntr));
9427 return (uint64_t) (pmccntr) << 6;
9433 gettimeofday(&tv, NULL);
9434 return (uint64_t) (tv.tv_sec) * 1000000 + tv.tv_usec;
9438#if defined(__GNUC__) || defined(__clang__)
9439#pragma pop_macro("ALIGN_STRUCT")
9440#pragma pop_macro("FORCE_INLINE")
9443#if defined(__GNUC__) && !defined(__clang__)
9444#pragma GCC pop_options
Definition sse2neon.h:2010
FORCE_INLINE __m128d _mm_mul_sd(__m128d a, __m128d b)
Definition sse2neon.h:4799
FORCE_INLINE __m128 _mm_mul_ss(__m128 a, __m128 b)
Definition sse2neon.h:2217
FORCE_INLINE __m128i _mm_sra_epi16(__m128i a, __m128i count)
Definition sse2neon.h:5400
FORCE_INLINE __m128 _mm_div_ps(__m128 a, __m128 b)
Definition sse2neon.h:1769
FORCE_INLINE int _mm_comige_ss(__m128 a, __m128 b)
Definition sse2neon.h:1500
FORCE_INLINE void _mm_storer_ps(float *p, __m128 a)
Definition sse2neon.h:2781
FORCE_INLINE __m128i _mm_sign_epi8(__m128i _a, __m128i _b)
Definition sse2neon.h:6689
FORCE_INLINE __m128d _mm_cmpgt_sd(__m128d a, __m128d b)
Definition sse2neon.h:3376
FORCE_INLINE __m128d _mm_loadr_pd(const double *p)
Definition sse2neon.h:4424
FORCE_INLINE uint32_t _mm_crc32_u32(uint32_t crc, uint32_t v)
Definition sse2neon.h:8617
FORCE_INLINE __m64 _mm_sign_pi32(__m64 _a, __m64 _b)
Definition sse2neon.h:6747
FORCE_INLINE __m128d _mm_loadl_pd(__m128d a, const double *p)
Definition sse2neon.h:4408
FORCE_INLINE __m128d _mm_loadu_pd(const double *p)
Definition sse2neon.h:4437
FORCE_INLINE __m128 _mm_shuffle_ps_0321(__m128 a, __m128 b)
Definition sse2neon.h:777
FORCE_INLINE __m128i _mm_cmpeq_epi16(__m128i a, __m128i b)
Definition sse2neon.h:3223
FORCE_INLINE uint32_t _mm_crc32_u16(uint32_t crc, uint16_t v)
Definition sse2neon.h:8598
FORCE_INLINE __m128i _mm_subs_epu16(__m128i a, __m128i b)
Definition sse2neon.h:5802
FORCE_INLINE __m128d _mm_setr_pd(double e1, double e0)
Definition sse2neon.h:5145
FORCE_INLINE int _mm_cmpistrz(__m128i a, __m128i b, const int imm8)
Definition sse2neon.h:8572
#define vreinterpretq_f32_m128i(x)
Definition sse2neon.h:472
FORCE_INLINE __m128i _mm_shuffle_epi_0321(__m128i a)
Definition sse2neon.h:1061
FORCE_INLINE uint8_t _sse2neon_vaddvq_u8(uint8x16_t a)
Definition sse2neon.h:660
FORCE_INLINE __m128i _mm_unpackhi_epi64(__m128i a, __m128i b)
Definition sse2neon.h:5877
FORCE_INLINE __m128i _mm_sub_epi64(__m128i a, __m128i b)
Definition sse2neon.h:5723
FORCE_INLINE __m128i _mm_loadu_si128(const __m128i *p)
Definition sse2neon.h:4445
FORCE_INLINE __m128i _mm_cvtps_epi32(__m128)
Definition sse2neon.h:3973
FORCE_INLINE int _mm_movemask_epi8(__m128i a)
Definition sse2neon.h:4653
FORCE_INLINE __m128i _mm_srl_epi32(__m128i a, __m128i count)
Definition sse2neon.h:5462
FORCE_INLINE __m64 _mm_cvt_ps2pi(__m128 a)
Definition sse2neon.h:1560
#define vreinterpret_m64_u16(x)
Definition sse2neon.h:491
FORCE_INLINE __m128 _mm_cvt_pi2ps(__m128 a, __m64 b)
Definition sse2neon.h:1550
FORCE_INLINE int64_t _mm_cvttss_si64(__m128 a)
Definition sse2neon.h:1758
FORCE_INLINE __m64 _mm_sad_pu8(__m64 a, __m64 b)
Definition sse2neon.h:2416
FORCE_INLINE int _mm_test_all_ones(__m128i a)
Definition sse2neon.h:7731
FORCE_INLINE __m128i _mm_cmpeq_epi32(__m128i, __m128i)
Definition sse2neon.h:3232
#define _MM_DENORMALS_ZERO_OFF
Definition sse2neon.h:394
FORCE_INLINE __m128d _mm_cmpge_sd(__m128d a, __m128d b)
Definition sse2neon.h:3302
FORCE_INLINE __m128i _mm_setzero_si128(void)
Definition sse2neon.h:5163
FORCE_INLINE int _mm_comineq_sd(__m128d a, __m128d b)
Definition sse2neon.h:3871
FORCE_INLINE __m128i _mm_hadds_epi16(__m128i _a, __m128i _b)
Definition sse2neon.h:6356
FORCE_INLINE __m64 _mm_abs_pi16(__m64 a)
Definition sse2neon.h:6225
FORCE_INLINE void _mm_storeu_si64(void *p, __m128i a)
Definition sse2neon.h:2806
FORCE_INLINE __m128i _mm_blendv_epi8(__m128i _a, __m128i _b, __m128i _mask)
Definition sse2neon.h:6856
FORCE_INLINE __m128d _mm_round_sd(__m128d a, __m128d b, int rounding)
Definition sse2neon.h:7690
FORCE_INLINE __m128d _mm_add_sd(__m128d a, __m128d b)
Definition sse2neon.h:3017
#define vreinterpretq_m128_u32(x)
Definition sse2neon.h:440
FORCE_INLINE void _mm_storeh_pd(double *mem_addr, __m128d a)
Definition sse2neon.h:5592
FORCE_INLINE __m128 _mm_set_ps(float w, float z, float y, float x)
Definition sse2neon.h:2459
FORCE_INLINE __m128d _mm_move_sd(__m128d, __m128d)
Definition sse2neon.h:4643
FORCE_INLINE int _mm_comieq_sd(__m128d a, __m128d b)
Definition sse2neon.h:3850
FORCE_INLINE __m128 _mm_cmpeq_ss(__m128 a, __m128 b)
Definition sse2neon.h:1270
FORCE_INLINE __m128i _mm_cvtpd_epi32(__m128d a)
Definition sse2neon.h:3902
FORCE_INLINE __m128 _mm_cmpge_ps(__m128 a, __m128 b)
Definition sse2neon.h:1278
#define SSE2NEON_AES_U0(p)
#define _sse2neon_const
Definition sse2neon.h:123
#define vreinterpretq_s32_m128i(x)
Definition sse2neon.h:477
#define vreinterpret_s64_m64(x)
Definition sse2neon.h:507
FORCE_INLINE int _mm_testz_si128(__m128i a, __m128i b)
Definition sse2neon.h:7796
FORCE_INLINE __m128i _mm_abs_epi32(__m128i a)
Definition sse2neon.h:6209
#define vreinterpretq_m128i_s16(x)
Definition sse2neon.h:463
FORCE_INLINE __m64 _mm_mul_su32(__m64 a, __m64 b)
Definition sse2neon.h:4807
#define vreinterpretq_f32_m128d(x)
Definition sse2neon.h:541
FORCE_INLINE __m128i _mm_and_si128(__m128i, __m128i)
Definition sse2neon.h:3089
FORCE_INLINE __m128i _mm_adds_epu16(__m128i a, __m128i b)
Definition sse2neon.h:3062
FORCE_INLINE __m64 _mm_cvtps_pi8(__m128 a)
Definition sse2neon.h:1660
FORCE_INLINE __m128 _mm_dp_ps(__m128 a, __m128 b, const int imm)
Definition sse2neon.h:7151
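One way to read the _mm_dp_ps immediate: the high nibble selects which lane products enter the sum, and the low nibble selects which result lanes receive it. A hedged sketch of a four-lane dot product (dot4 is an illustrative name, not from the header):

    static float dot4(__m128 a, __m128 b)
    {
        /* 0xF1: multiply all four lanes, write the sum to lane 0 only */
        return _mm_cvtss_f32(_mm_dp_ps(a, b, 0xF1));
    }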
FORCE_INLINE __m128 _mm_sqrt_ss(__m128 in)
Definition sse2neon.h:2720
#define vreinterpretq_m128d_s64(x)
Definition sse2neon.h:529
FORCE_INLINE __m128i _mm_set1_epi32(int)
Definition sse2neon.h:5046
FORCE_INLINE void _mm_setcsr(unsigned int a)
Definition sse2neon.h:2522
FORCE_INLINE __m128i _mm_shuffle_epi_1032(__m128i a)
Definition sse2neon.h:1042
float32x4_t __m128d
Definition sse2neon.h:413
FORCE_INLINE __m128 _mm_unpackhi_ps(__m128 a, __m128 b)
Definition sse2neon.h:2917
#define SSE2NEON_AES_RSBOX(w)
Definition sse2neon.h:8755
FORCE_INLINE __m128i _mm_shuffle_epi_0122(__m128i a)
Definition sse2neon.h:1108
FORCE_INLINE uint16_t _sse2neon_sido_negative(int res, int lb, int imm8, int bound)
Definition sse2neon.h:8271
FORCE_INLINE int _mm_cmpestrc(__m128i a, int la, __m128i b, int lb, const int imm8)
Definition sse2neon.h:8407
FORCE_INLINE __m128 _mm_rcp_ps(__m128 in)
Definition sse2neon.h:2347
#define vreinterpretq_s8_m128i(x)
Definition sse2neon.h:475
FORCE_INLINE __m128i _mm_max_epu8(__m128i a, __m128i b)
Definition sse2neon.h:4509
FORCE_INLINE __m128i _mm_loadu_si32(const void *p)
Definition sse2neon.h:4452
FORCE_INLINE void _mm_stream_si64(__int64 *p, __int64 a)
Definition sse2neon.h:5697
FORCE_INLINE __m128 _mm_shuffle_ps_2301(__m128 a, __m128 b)
Definition sse2neon.h:770
FORCE_INLINE __m128d _mm_cmpnge_sd(__m128d a, __m128d b)
Definition sse2neon.h:3573
FORCE_INLINE __m128 _mm_shuffle_ps_1001(__m128 a, __m128 b)
Definition sse2neon.h:802
FORCE_INLINE __m128d _mm_cvtpi32_pd(__m64 a)
Definition sse2neon.h:3956
FORCE_INLINE __m128i _mm_madd_epi16(__m128i a, __m128i b)
Definition sse2neon.h:4462
#define SSE2NEON_GENERATE_CMP_EQUAL_ORDERED(prefix)
Definition sse2neon.h:8234
FORCE_INLINE __m128d _mm_floor_pd(__m128d)
Definition sse2neon.h:7223
FORCE_INLINE __m128 _mm_cmpnle_ps(__m128 a, __m128 b)
Definition sse2neon.h:1405
FORCE_INLINE __m128d _mm_ceil_sd(__m128d a, __m128d b)
Definition sse2neon.h:6933
FORCE_INLINE uint64_t _rdtsc(void)
Definition sse2neon.h:9399
FORCE_INLINE __m128i _mm_add_epi16(__m128i a, __m128i b)
Definition sse2neon.h:2959
FORCE_INLINE __m128d _mm_cmplt_pd(__m128d a, __m128d b)
Definition sse2neon.h:3474
FORCE_INLINE int32_t _mm_cvttsd_si32(__m128d a)
Definition sse2neon.h:4240
FORCE_INLINE void _mm_storeu_ps(float *p, __m128 a)
Definition sse2neon.h:2792
FORCE_INLINE __m64 _mm_hsub_pi16(__m64 _a, __m64 _b)
Definition sse2neon.h:6426
FORCE_INLINE __m128d _mm_cmpord_pd(__m128d a, __m128d b)
Definition sse2neon.h:3686
#define vreinterpretq_s64_m128d(x)
Definition sse2neon.h:536
FORCE_INLINE __m128i _mm_setr_epi32(int i3, int i2, int i1, int i0)
Definition sse2neon.h:5102
FORCE_INLINE __m128i _mm_cvtepu8_epi16(__m128i a)
Definition sse2neon.h:7059
FORCE_INLINE __m128 _mm_moveldup_ps(__m128 a)
Definition sse2neon.h:6180
FORCE_INLINE __m128i _mm_shuffle_epi_1010(__m128i a)
Definition sse2neon.h:1077
FORCE_INLINE __m128d _mm_or_pd(__m128d a, __m128d b)
Definition sse2neon.h:4871
FORCE_INLINE __m128i _mm_cvtepu8_epi32(__m128i a)
Definition sse2neon.h:7069
FORCE_INLINE __m64 _mm_sign_pi8(__m64 _a, __m64 _b)
Definition sse2neon.h:6776
FORCE_INLINE __m128d _mm_load1_pd(const double *p)
Definition sse2neon.h:4369
#define _MM_DENORMALS_ZERO_ON
Definition sse2neon.h:393
FORCE_INLINE __m128i _mm_add_epi8(__m128i a, __m128i b)
Definition sse2neon.h:2983
FORCE_INLINE __m64 _mm_cvtt_ps2pi(__m128 a)
Definition sse2neon.h:1731
FORCE_INLINE __m128 _mm_shuffle_ps_3210(__m128 a, __m128 b)
Definition sse2neon.h:818
FORCE_INLINE __m128 _mm_mul_ps(__m128 a, __m128 b)
Definition sse2neon.h:2207
FORCE_INLINE __m128i _mm_set1_epi8(signed char w)
Definition sse2neon.h:5067
#define vreinterpretq_m128i_u8(x)
Definition sse2neon.h:467
FORCE_INLINE int _mm_comile_ss(__m128 a, __m128 b)
Definition sse2neon.h:1520
FORCE_INLINE __m128i _mm_cvttps_epi32(__m128 a)
Definition sse2neon.h:4232
FORCE_INLINE __m128d _mm_round_pd(__m128d, int)
Definition sse2neon.h:7560
FORCE_INLINE __m128 _mm_cvtpi16_ps(__m64 a)
Definition sse2neon.h:1601
#define _MM_ROUND_NEAREST
Definition sse2neon.h:383
FORCE_INLINE __m128 _mm_loadl_pi(__m128 a, __m64 const *p)
Definition sse2neon.h:1941
FORCE_INLINE __m128 _mm_shuffle_ps_1010(__m128 a, __m128 b)
Definition sse2neon.h:795
FORCE_INLINE int _mm_test_all_zeros(__m128i a, __m128i mask)
Definition sse2neon.h:7740
FORCE_INLINE __m128i _mm_clmulepi64_si128(__m128i _a, __m128i _b, const int imm)
Definition sse2neon.h:9268
#define vreinterpret_u32_m64(x)
Definition sse2neon.h:501
FORCE_INLINE __m64 _mm_movepi64_pi64(__m128i a)
Definition sse2neon.h:4746
#define vreinterpretq_nth_u64_m128i(x, n)
Definition sse2neon.h:580
FORCE_INLINE __m128 _mm_cmpunord_ps(__m128 a, __m128 b)
Definition sse2neon.h:1469
FORCE_INLINE __m128d _mm_cmpneq_pd(__m128d a, __m128d b)
Definition sse2neon.h:3520
FORCE_INLINE __m128i _mm_min_epi8(__m128i a, __m128i b)
Definition sse2neon.h:7368
FORCE_INLINE __m128i _mm_min_epu32(__m128i a, __m128i b)
Definition sse2neon.h:7386
FORCE_INLINE __m128 _mm_cvtpi32_ps(__m128 a, __m64 b)
Definition sse2neon.h:1611
#define vreinterpretq_m128d_f32(x)
Definition sse2neon.h:534
FORCE_INLINE __m128d _mm_cmpnlt_pd(__m128d a, __m128d b)
Definition sse2neon.h:3651
FORCE_INLINE int64_t _mm_cvtsd_si64(__m128d a)
Definition sse2neon.h:4072
#define vreinterpretq_f64_m128i(x)
Definition sse2neon.h:473
FORCE_INLINE __m128 _mm_cmple_ps(__m128 a, __m128 b)
Definition sse2neon.h:1314
FORCE_INLINE __m128d _mm_unpackhi_pd(__m128d a, __m128d b)
Definition sse2neon.h:5910
FORCE_INLINE __m128i _mm_cvttpd_epi32(__m128d a)
Definition sse2neon.h:4209
#define vreinterpret_m64_s16(x)
Definition sse2neon.h:486
FORCE_INLINE int _mm_movemask_pd(__m128d a)
Definition sse2neon.h:4736
FORCE_INLINE __m128 _mm_set_ps1(float)
Definition sse2neon.h:2468
FORCE_INLINE unsigned int _sse2neon_mm_get_flush_zero_mode(void)
Definition sse2neon.h:1841
#define SSE2NEON_CMPSTR_GENERATE_INDEX(r2, bound, imm8)
Definition sse2neon.h:8357
FORCE_INLINE __m128d _mm_cmpnle_sd(__m128d a, __m128d b)
Definition sse2neon.h:3643
FORCE_INLINE __m128i _mm_max_epu32(__m128i a, __m128i b)
Definition sse2neon.h:7350
FORCE_INLINE __m128d _mm_hsub_pd(__m128d a, __m128d b)
Definition sse2neon.h:6096
#define _SIDD_NEGATIVE_POLARITY
Definition sse2neon.h:7828
#define _MM_ROUND_DOWN
Definition sse2neon.h:384
FORCE_INLINE __m128i _mm_min_epi32(__m128i a, __m128i b)
Definition sse2neon.h:7359
FORCE_INLINE __m128 _mm_shuffle_ps_2010(__m128 a, __m128 b)
Definition sse2neon.h:866
FORCE_INLINE __m128i _mm_set1_epi64(__m64 _i)
Definition sse2neon.h:5053
FORCE_INLINE __m128 _mm_undefined_ps(void)
Definition sse2neon.h:2898
FORCE_INLINE __m128d _mm_cmpunord_sd(__m128d a, __m128d b)
Definition sse2neon.h:3767
FORCE_INLINE __m128d _mm_hadd_pd(__m128d a, __m128d b)
Definition sse2neon.h:6056
FORCE_INLINE __m128 _mm_set1_ps(float _w)
Definition sse2neon.h:2513
FORCE_INLINE __m128i _mm_slli_epi64(__m128i a, int imm)
Definition sse2neon.h:5344
FORCE_INLINE __m128 _mm_sub_ss(__m128 a, __m128 b)
Definition sse2neon.h:2846
FORCE_INLINE __m128i _mm_packus_epi16(const __m128i a, const __m128i b)
Definition sse2neon.h:4909
FORCE_INLINE __m128i _mm_cvtepi32_epi64(__m128i a)
Definition sse2neon.h:6988
FORCE_INLINE double sse2neon_recast_u64_f64(uint64_t val)
Definition sse2neon.h:131
FORCE_INLINE __m128i _mm_srai_epi16(__m128i a, int imm)
Definition sse2neon.h:5424
FORCE_INLINE __m128d _mm_set_sd(double a)
Definition sse2neon.h:5028
FORCE_INLINE __m64 _mm_cvtpd_pi32(__m128d a)
Definition sse2neon.h:3924
FORCE_INLINE __m128 _mm_ceil_ss(__m128 a, __m128 b)
Definition sse2neon.h:6943
FORCE_INLINE __m128i _mm_shuffle_epi8(__m128i a, __m128i b)
Definition sse2neon.h:6589
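_mm_shuffle_epi8 follows PSHUFB semantics: each control byte picks a source byte with its low four bits, or zeroes the lane when its sign bit is set. A small sketch that reverses the byte order of a vector (reverse_bytes is an assumed helper name):

    static __m128i reverse_bytes(__m128i v)
    {
        /* control bytes 15..0 select the source bytes in reverse order */
        const __m128i ctrl = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7,
                                          8, 9, 10, 11, 12, 13, 14, 15);
        return _mm_shuffle_epi8(v, ctrl);
    }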
FORCE_INLINE __m64 _mm_abs_pi8(__m64 a)
Definition sse2neon.h:6241
FORCE_INLINE __m128d _mm_addsub_pd(__m128d a, __m128d b)
Definition sse2neon.h:6024
FORCE_INLINE __m128 _mm_cmpngt_ss(__m128 a, __m128 b)
Definition sse2neon.h:1397
FORCE_INLINE int _mm_comilt_sd(__m128d a, __m128d b)
Definition sse2neon.h:3834
FORCE_INLINE __m128i _mm_undefined_si128(void)
Definition sse2neon.h:2880
FORCE_INLINE int _mm_comile_sd(__m128d a, __m128d b)
Definition sse2neon.h:3818
FORCE_INLINE __m128d _mm_max_pd(__m128d a, __m128d b)
Definition sse2neon.h:4518
FORCE_INLINE __m128 _mm_shuffle_ps_0011(__m128 a, __m128 b)
Definition sse2neon.h:825
FORCE_INLINE __m64 _mm_min_pu8(__m64 a, __m64 b)
Definition sse2neon.h:2103
FORCE_INLINE __m128d _mm_castsi128_pd(__m128i a)
Definition sse2neon.h:3178
FORCE_INLINE __m128d _mm_mul_pd(__m128d a, __m128d b)
Definition sse2neon.h:4774
#define _MM_FROUND_TO_NEG_INF
Definition sse2neon.h:371
FORCE_INLINE __m128d _mm_dp_pd(__m128d a, __m128d b, const int imm)
Definition sse2neon.h:7093
FORCE_INLINE int _mm_test_mix_ones_zeros(__m128i a, __m128i mask)
Definition sse2neon.h:7754
FORCE_INLINE void _mm_storel_pd(double *mem_addr, __m128d a)
Definition sse2neon.h:5611
FORCE_INLINE void _mm_storel_pi(__m64 *p, __m128 a)
Definition sse2neon.h:2772
FORCE_INLINE int _mm_popcnt_u32(unsigned int a)
Definition sse2neon.h:9313
FORCE_INLINE __m128 _mm_cmpgt_ps(__m128 a, __m128 b)
Definition sse2neon.h:1296
FORCE_INLINE __m128 _mm_cvtpu8_ps(__m64 a)
Definition sse2neon.h:1679
FORCE_INLINE __m128 _mm_max_ss(__m128 a, __m128 b)
Definition sse2neon.h:2067
FORCE_INLINE void _mm_empty(void)
Definition sse2neon.h:1195
FORCE_INLINE __m128i _mm_sub_epi32(__m128i a, __m128i b)
Definition sse2neon.h:5714
#define vreinterpretq_u64_m128(x)
Definition sse2neon.h:455
FORCE_INLINE void _mm_storeu_si32(void *p, __m128i a)
Definition sse2neon.h:5650
FORCE_INLINE __m128i _mm_unpackhi_epi8(__m128i a, __m128i b)
Definition sse2neon.h:5892
FORCE_INLINE void _mm_stream_si128(__m128i *p, __m128i a)
Definition sse2neon.h:5675
FORCE_INLINE uint32_t _mm_crc32_u8(uint32_t, uint8_t)
Definition sse2neon.h:8654
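The CRC intrinsics compute CRC-32C (Castagnoli). A byte-at-a-time sketch under one common convention (initial value 0xFFFFFFFF, final inversion); crc32c_bytes is an illustrative name, and the wider _mm_crc32_u32/_mm_crc32_u64 forms would be faster in practice:

    #include <stddef.h>
    #include <stdint.h>

    static uint32_t crc32c_bytes(const uint8_t *p, size_t n)
    {
        uint32_t crc = 0xFFFFFFFFu;
        for (size_t i = 0; i < n; i++)
            crc = _mm_crc32_u8(crc, p[i]);   /* fold in one byte */
        return ~crc;
    }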
FORCE_INLINE __m128 _mm_cmpneq_ss(__m128 a, __m128 b)
Definition sse2neon.h:1360
FORCE_INLINE __m128i _mm_cvtepi8_epi32(__m128i a)
Definition sse2neon.h:7007
FORCE_INLINE __m128d _mm_cmpngt_pd(__m128d a, __m128d b)
Definition sse2neon.h:3581
FORCE_INLINE void _mm_storeu_pd(double *mem_addr, __m128d a)
Definition sse2neon.h:5634
FORCE_INLINE int _sse2neon_ctzll(unsigned long long x)
Definition sse2neon.h:8314
FORCE_INLINE __m128 _mm_cmpnlt_ss(__m128 a, __m128 b)
Definition sse2neon.h:1433
FORCE_INLINE int64_t _mm_cvtss_si64(__m128 a)
Definition sse2neon.h:1716
FORCE_INLINE __m128 _mm_cmpunord_ss(__m128 a, __m128 b)
Definition sse2neon.h:1482
FORCE_INLINE __m128i _mm_packs_epi16(__m128i a, __m128i b)
Definition sse2neon.h:4889
FORCE_INLINE __m64 _mm_add_si64(__m64 a, __m64 b)
Definition sse2neon.h:3035
FORCE_INLINE __m128 _mm_loadu_ps(const float *p)
Definition sse2neon.h:1961
FORCE_INLINE __m128i _mm_hadd_epi32(__m128i _a, __m128i _b)
Definition sse2neon.h:6322
FORCE_INLINE int _mm_comilt_ss(__m128 a, __m128 b)
Definition sse2neon.h:1530
FORCE_INLINE int64_t _mm_cvtsi128_si64(__m128i a)
Definition sse2neon.h:4117
FORCE_INLINE __m128i _mm_xor_si128(__m128i a, __m128i b)
Definition sse2neon.h:6012
FORCE_INLINE __m128i _mm_maddubs_epi16(__m128i _a, __m128i _b)
Definition sse2neon.h:6489
FORCE_INLINE __m128 _mm_shuffle_ps_0022(__m128 a, __m128 b)
Definition sse2neon.h:832
FORCE_INLINE __m128i _mm_cmpeq_epi8(__m128i a, __m128i b)
Definition sse2neon.h:3241
#define vreinterpretq_s32_m128(x)
Definition sse2neon.h:459
FORCE_INLINE void _mm_store_ss(float *p, __m128 a)
Definition sse2neon.h:2750
FORCE_INLINE __m64 _mm_mulhrs_pi16(__m64 a, __m64 b)
Definition sse2neon.h:6577
FORCE_INLINE __m128 _mm_cmpnle_ss(__m128 a, __m128 b)
Definition sse2neon.h:1415
FORCE_INLINE __m64 _mm_hsubs_pi16(__m64 _a, __m64 _b)
Definition sse2neon.h:6472
FORCE_INLINE __m128i _mm_castpd_si128(__m128d a)
Definition sse2neon.h:3154
FORCE_INLINE __m128d _mm_cmpnle_pd(__m128d a, __m128d b)
Definition sse2neon.h:3616
FORCE_INLINE __m128i _mm_sll_epi32(__m128i a, __m128i count)
Definition sse2neon.h:5296
FORCE_INLINE __m128i _mm_sign_epi32(__m128i _a, __m128i _b)
Definition sse2neon.h:6660
FORCE_INLINE uint8_t _sse2neon_vaddv_u8(uint8x8_t v8)
Definition sse2neon.h:645
FORCE_INLINE __m128i _mm_cvtepi16_epi32(__m128i a)
Definition sse2neon.h:6968
#define vreinterpretq_u32_m128i(x)
Definition sse2neon.h:482
FORCE_INLINE void * _mm_malloc(size_t size, size_t align)
Definition sse2neon.h:1989
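_mm_malloc pairs with _mm_free for aligned allocations. A hedged usage sketch (the buffer size and alignment are chosen arbitrarily for illustration):

    float *buf = (float *)_mm_malloc(256 * sizeof(float), 16);  /* 16-byte aligned */
    if (buf) {
        /* ... safe for aligned accesses such as _mm_load_ps/_mm_store_ps ... */
        _mm_free(buf);                       /* release with the matching free */
    }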
FORCE_INLINE __m64 _mm_hadds_pi16(__m64 _a, __m64 _b)
Definition sse2neon.h:6379
FORCE_INLINE __m128i _mm_setr_epi8(signed char b0, signed char b1, signed char b2, signed char b3, signed char b4, signed char b5, signed char b6, signed char b7, signed char b8, signed char b9, signed char b10, signed char b11, signed char b12, signed char b13, signed char b14, signed char b15)
Definition sse2neon.h:5117
FORCE_INLINE __m128i _mm_load_si128(const __m128i *p)
Definition sse2neon.h:4361
FORCE_INLINE __m128 _mm_andnot_ps(__m128 a, __m128 b)
Definition sse2neon.h:1232
FORCE_INLINE __m128 _mm_movelh_ps(__m128 __A, __m128 __B)
Definition sse2neon.h:2153
FORCE_INLINE __m128d _mm_cmpunord_pd(__m128d a, __m128d b)
Definition sse2neon.h:3736
FORCE_INLINE __m128i _mm_sub_epi16(__m128i a, __m128i b)
Definition sse2neon.h:5705
FORCE_INLINE __m128i _mm_cvtepi8_epi64(__m128i a)
Definition sse2neon.h:7018
FORCE_INLINE __m128i _mm_shuffle_epi_3332(__m128i a)
Definition sse2neon.h:1115
FORCE_INLINE __m128 _mm_rsqrt_ss(__m128 in)
Definition sse2neon.h:2406
#define vreinterpretq_m128i_s64(x)
Definition sse2neon.h:465
FORCE_INLINE __m128 _mm_setzero_ps(void)
Definition sse2neon.h:2546
FORCE_INLINE int _sse2neon_clz(unsigned int x)
Definition sse2neon.h:8290
#define vreinterpretq_m128_s64(x)
Definition sse2neon.h:446
FORCE_INLINE void _mm_storeu_si128(__m128i *p, __m128i a)
Definition sse2neon.h:5642
FORCE_INLINE void _mm_free(void *addr)
Definition sse2neon.h:1807
#define vreinterpretq_u32_m128(x)
Definition sse2neon.h:454
#define SSE2NEON_COMP_AGG(a, b, la, lb, imm8, IE)
Definition sse2neon.h:8351
FORCE_INLINE __m128d _mm_div_pd(__m128d a, __m128d b)
Definition sse2neon.h:4269
FORCE_INLINE __m128 _mm_loadr_ps(const float *p)
Definition sse2neon.h:1951
FORCE_INLINE __m128i _mm_sad_epu8(__m128i a, __m128i b)
Definition sse2neon.h:4936
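_mm_sad_epu8 leaves two 16-bit partial sums, one in the low half of each 64-bit lane, so a scalar total needs both halves. A sketch (sad16 is an assumed helper name):

    static unsigned sad16(__m128i a, __m128i b)
    {
        __m128i s = _mm_sad_epu8(a, b);               /* |a-b| summed per 8-byte half */
        return (unsigned)(_mm_cvtsi128_si32(s) +
                          _mm_cvtsi128_si32(_mm_srli_si128(s, 8)));
    }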
FORCE_INLINE int _mm_comigt_sd(__m128d a, __m128d b)
Definition sse2neon.h:3802
FORCE_INLINE __m128i _mm_cvtepu8_epi64(__m128i a)
Definition sse2neon.h:7080
FORCE_INLINE __m128 _mm_and_ps(__m128 a, __m128 b)
Definition sse2neon.h:1223
FORCE_INLINE __m128i _mm_aeskeygenassist_si128(__m128i a, const int rcon)
Definition sse2neon.h:9129
FORCE_INLINE int64_t sse2neon_recast_f64_s64(double val)
Definition sse2neon.h:137
FORCE_INLINE __m128i _mm_cmplt_epi32(__m128i a, __m128i b)
Definition sse2neon.h:3455
FORCE_INLINE __m128d _mm_cmple_pd(__m128d a, __m128d b)
Definition sse2neon.h:3397
FORCE_INLINE __m128i _mm_hsub_epi16(__m128i _a, __m128i _b)
Definition sse2neon.h:6394
FORCE_INLINE int _mm_comineq_ss(__m128 a, __m128 b)
Definition sse2neon.h:1540
FORCE_INLINE __m128i _mm_set1_epi64x(int64_t _i)
Definition sse2neon.h:5060
FORCE_INLINE __m128i _mm_move_epi64(__m128i a)
Definition sse2neon.h:4633
#define _MM_ROUND_UP
Definition sse2neon.h:385
FORCE_INLINE __m128i _mm_slli_epi16(__m128i a, int imm)
Definition sse2neon.h:5322
FORCE_INLINE __m128 _mm_div_ss(__m128 a, __m128 b)
Definition sse2neon.h:1790
FORCE_INLINE int _mm_cmpistrc(__m128i a, __m128i b, const int imm8)
Definition sse2neon.h:8524
int64x1_t __m64
Definition sse2neon.h:405
FORCE_INLINE __m128i _mm_min_epu16(__m128i a, __m128i b)
Definition sse2neon.h:7377
FORCE_INLINE __m128i _mm_min_epu8(__m128i a, __m128i b)
Definition sse2neon.h:4576
FORCE_INLINE __m128i _mm_sra_epi32(__m128i a, __m128i count)
Definition sse2neon.h:5412
FORCE_INLINE __m128 _mm_castsi128_ps(__m128i a)
Definition sse2neon.h:3190
FORCE_INLINE __m128i _mm_cmplt_epi8(__m128i a, __m128i b)
Definition sse2neon.h:3465
#define vreinterpret_s32_m64(x)
Definition sse2neon.h:506
FORCE_INLINE int _mm_cmpistrs(__m128i a, __m128i b, const int imm8)
Definition sse2neon.h:8560
FORCE_INLINE __m64 _mm_shuffle_pi8(__m64 a, __m64 b)
Definition sse2neon.h:6619
#define vreinterpretq_m128d_u32(x)
Definition sse2neon.h:531
FORCE_INLINE __m128d _mm_andnot_pd(__m128d a, __m128d b)
Definition sse2neon.h:3098
FORCE_INLINE __m128d _mm_cvtss_sd(__m128d a, __m128 b)
Definition sse2neon.h:4194
#define vreinterpret_m64_u64(x)
Definition sse2neon.h:493
FORCE_INLINE __m64 _mm_max_pi16(__m64 a, __m64 b)
Definition sse2neon.h:2029
#define __int64
Definition sse2neon.h:428
FORCE_INLINE __m128 _mm_shuffle_ps_3202(__m128 a, __m128 b)
Definition sse2neon.h:848
FORCE_INLINE __m128i _mm_cmpgt_epi16(__m128i a, __m128i b)
Definition sse2neon.h:3323
FORCE_INLINE __m128 _mm_floor_ps(__m128)
Definition sse2neon.h:7239
FORCE_INLINE __m128d _mm_cmpgt_pd(__m128d a, __m128d b)
Definition sse2neon.h:3350
#define vreinterpretq_u64_m128i(x)
Definition sse2neon.h:483
FORCE_INLINE __m128i _mm_cvtsi64_si128(int64_t a)
Definition sse2neon.h:4173
FORCE_INLINE __m128i _mm_cvtepu16_epi32(__m128i a)
Definition sse2neon.h:7030
FORCE_INLINE __m128i _mm_loadl_epi64(__m128i const *p)
Definition sse2neon.h:4395
FORCE_INLINE void _sse2neon_mm_set_denormals_zero_mode(unsigned int flag)
Definition sse2neon.h:9369
FORCE_INLINE __m128 _mm_blendv_ps(__m128 _a, __m128 _b, __m128 _mask)
Definition sse2neon.h:6887
FORCE_INLINE __m128i _mm_set_epi64x(int64_t, int64_t)
Definition sse2neon.h:4974
#define _MM_ROUND_TOWARD_ZERO
Definition sse2neon.h:386
FORCE_INLINE __m128 _mm_shuffle_ps_0101(__m128 a, __m128 b)
Definition sse2neon.h:809
FORCE_INLINE __m128 _mm_cmpeq_ps(__m128 a, __m128 b)
Definition sse2neon.h:1260
FORCE_INLINE __m128i _mm_aesimc_si128(__m128i a)
Definition sse2neon.h:9075
FORCE_INLINE unsigned int _MM_GET_ROUNDING_MODE(void)
Definition sse2neon.h:1865
FORCE_INLINE __m64 _mm_sub_si64(__m64 a, __m64 b)
Definition sse2neon.h:5775
FORCE_INLINE void _mm_maskmoveu_si128(__m128i a, __m128i mask, char *mem_addr)
Definition sse2neon.h:4487
FORCE_INLINE __m128 _mm_cmplt_ss(__m128 a, __m128 b)
Definition sse2neon.h:1342
FORCE_INLINE __m128i _mm_max_epi32(__m128i a, __m128i b)
Definition sse2neon.h:7323
FORCE_INLINE int _mm_cmpestra(__m128i a, int la, __m128i b, int lb, const int imm8)
Definition sse2neon.h:8393
#define SSE2NEON_CMPESTR_LIST
Definition sse2neon.h:8241
FORCE_INLINE __m128i _mm_subs_epi8(__m128i a, __m128i b)
Definition sse2neon.h:5793
FORCE_INLINE __m128 _mm_add_ps(__m128 a, __m128 b)
Definition sse2neon.h:1202
FORCE_INLINE void _sse2neon_set_fpcr(uint64_t value)
Definition sse2neon.h:1828
FORCE_INLINE __m128 _mm_cmpord_ss(__m128 a, __m128 b)
Definition sse2neon.h:1461
FORCE_INLINE __m128 _mm_cmplt_ps(__m128 a, __m128 b)
Definition sse2neon.h:1332
FORCE_INLINE int _mm_movemask_ps(__m128 a)
Definition sse2neon.h:2184
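Comparisons produce all-ones or all-zero lanes, and _mm_movemask_ps collapses their sign bits into a four-bit scalar mask. A hedged sketch (any_greater is an illustrative name):

    static int any_greater(__m128 a, __m128 b)
    {
        return _mm_movemask_ps(_mm_cmpgt_ps(a, b)) != 0;  /* one bit per lane */
    }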
FORCE_INLINE __m128d _mm_floor_sd(__m128d a, __m128d b)
Definition sse2neon.h:7255
FORCE_INLINE __m128i _mm_setr_epi16(short w0, short w1, short w2, short w3, short w4, short w5, short w6, short w7)
Definition sse2neon.h:5087
FORCE_INLINE __m128i _mm_or_si128(__m128i, __m128i)
Definition sse2neon.h:4880
FORCE_INLINE __m128d _mm_cmpneq_sd(__m128d a, __m128d b)
Definition sse2neon.h:3538
FORCE_INLINE unsigned int _sse2neon_mm_get_denormals_zero_mode(void)
Definition sse2neon.h:9290
FORCE_INLINE __m128i _mm_aesdeclast_si128(__m128i a, __m128i RoundKey)
Definition sse2neon.h:9037
FORCE_INLINE __m128i _mm_cvtepi8_epi16(__m128i a)
Definition sse2neon.h:6997
FORCE_INLINE __m128i _mm_cvtepu32_epi64(__m128i a)
Definition sse2neon.h:7050
#define _mm_set_pd1
Definition sse2neon.h:5023
FORCE_INLINE __m128i _mm_max_epi16(__m128i a, __m128i b)
Definition sse2neon.h:4500
FORCE_INLINE void _mm_stream_si32(int *p, int a)
Definition sse2neon.h:5688
FORCE_INLINE __m128i _mm_max_epu16(__m128i a, __m128i b)
Definition sse2neon.h:7341
FORCE_INLINE __m128i _mm_hsub_epi32(__m128i _a, __m128i _b)
Definition sse2neon.h:6410
FORCE_INLINE __m128d _mm_cmpnlt_sd(__m128d a, __m128d b)
Definition sse2neon.h:3678
FORCE_INLINE __m128i _mm_aesdec_si128(__m128i a, __m128i RoundKey)
Definition sse2neon.h:8915
FORCE_INLINE __m128i _mm_adds_epi16(__m128i a, __m128i b)
Definition sse2neon.h:3044
FORCE_INLINE int32_t _mm_cvtsd_si32(__m128d a)
Definition sse2neon.h:4057
FORCE_INLINE void _mm_storel_epi64(__m128i *a, __m128i b)
Definition sse2neon.h:5603
FORCE_INLINE __m128d _mm_set1_pd(double d)
Definition sse2neon.h:5075
#define _MM_FROUND_TO_NEAREST_INT
Definition sse2neon.h:370
FORCE_INLINE __m128 _mm_unpacklo_ps(__m128 a, __m128 b)
Definition sse2neon.h:2933
FORCE_INLINE __m128 _mm_cvtsi64_ss(__m128 a, int64_t b)
Definition sse2neon.h:1695
#define vreinterpretq_m128d_u64(x)
Definition sse2neon.h:532
FORCE_INLINE __m128i _mm_cvtsi32_si128(int a)
Definition sse2neon.h:4149
FORCE_INLINE __m128 _mm_cmple_ss(__m128 a, __m128 b)
Definition sse2neon.h:1324
FORCE_INLINE __m128d _mm_sub_pd(__m128d a, __m128d b)
Definition sse2neon.h:5742
FORCE_INLINE uint64_t _mm_crc32_u64(uint64_t crc, uint64_t v)
Definition sse2neon.h:8636
#define SSE2NEON_AES_U3(p)
#define vreinterpret_s16_m64(x)
Definition sse2neon.h:505
FORCE_INLINE __m128i _mm_abs_epi16(__m128i a)
Definition sse2neon.h:6201
FORCE_INLINE __m128i _mm_set_epi16(short i7, short i6, short i5, short i4, short i3, short i2, short i1, short i0)
Definition sse2neon.h:4944
FORCE_INLINE __m128d _mm_load_sd(const double *p)
Definition sse2neon.h:4347
FORCE_INLINE __m128 _mm_cmpnge_ss(__m128 a, __m128 b)
Definition sse2neon.h:1379
FORCE_INLINE __m128 _mm_cvtpd_ps(__m128d a)
Definition sse2neon.h:3940
#define vreinterpretq_m128i_u32(x)
Definition sse2neon.h:469
int16_t ALIGN_STRUCT(1) unaligned_int16_t
Definition sse2neon.h:418
FORCE_INLINE __m128i _mm_subs_epu8(__m128i a, __m128i b)
Definition sse2neon.h:5811
FORCE_INLINE __m64 _mm_min_pi16(__m64 a, __m64 b)
Definition sse2neon.h:2077
FORCE_INLINE __m128i _mm_shuffle_epi_0101(__m128i a)
Definition sse2neon.h:1095
FORCE_INLINE int _mm_cmpistri(__m128i a, __m128i b, const int imm8)
Definition sse2neon.h:8533
FORCE_INLINE __m128 _mm_movehl_ps(__m128 a, __m128 b)
Definition sse2neon.h:2137
#define SSE2NEON_AES_U1(p)
FORCE_INLINE __m128d _mm_sub_sd(__m128d a, __m128d b)
Definition sse2neon.h:5768
FORCE_INLINE __m128i _mm_cmpestrm(__m128i a, int la, __m128i b, int lb, const int imm8)
Definition sse2neon.h:8434
FORCE_INLINE __m128i _mm_avg_epu16(__m128i a, __m128i b)
Definition sse2neon.h:3118
FORCE_INLINE __m128i _mm_shuffle_epi_2103(__m128i a)
Definition sse2neon.h:1069
FORCE_INLINE __m128 _mm_add_ss(__m128 a, __m128 b)
Definition sse2neon.h:1212
FORCE_INLINE void _mm_stream_pi(__m64 *p, __m64 a)
Definition sse2neon.h:2814
FORCE_INLINE __m128 _mm_ceil_ps(__m128)
Definition sse2neon.h:6917
FORCE_INLINE __m128i _mm_mpsadbw_epu8(__m128i a, __m128i b, const int imm)
Definition sse2neon.h:7445
#define vreinterpretq_m128i_s32(x)
Definition sse2neon.h:464
FORCE_INLINE __m128i _mm_mullo_epi32(__m128i a, __m128i b)
Definition sse2neon.h:7540
FORCE_INLINE __m128 _mm_blend_ps(__m128 _a, __m128 _b, const char imm8)
Definition sse2neon.h:6842
FORCE_INLINE __m128i _mm_aesenc_si128(__m128i a, __m128i RoundKey)
Definition sse2neon.h:8818
FORCE_INLINE __m128 _mm_shuffle_ps_2103(__m128 a, __m128 b)
Definition sse2neon.h:786
FORCE_INLINE __m128 _mm_set_ss(float a)
Definition sse2neon.h:2505
FORCE_INLINE __m128 _mm_shuffle_ps_2032(__m128 a, __m128 b)
Definition sse2neon.h:884
FORCE_INLINE __m128i _mm_mullo_epi16(__m128i a, __m128i b)
Definition sse2neon.h:4862
FORCE_INLINE __m128 _mm_shuffle_ps_2001(__m128 a, __m128 b)
Definition sse2neon.h:875
FORCE_INLINE int64_t _mm_popcnt_u64(uint64_t a)
Definition sse2neon.h:9342
FORCE_INLINE __m128i _mm_subs_epi16(__m128i a, __m128i b)
Definition sse2neon.h:5784
FORCE_INLINE __m128i _mm_minpos_epu16(__m128i a)
Definition sse2neon.h:7395
#define SSE2NEON_GENERATE_CMP_EQUAL_ANY(prefix)
Definition sse2neon.h:8058
FORCE_INLINE __m128d _mm_cvtepi32_pd(__m128i a)
Definition sse2neon.h:3879
FORCE_INLINE __m128d _mm_sqrt_sd(__m128d a, __m128d b)
Definition sse2neon.h:5385
FORCE_INLINE int _mm_cmpistro(__m128i a, __m128i b, const int imm8)
Definition sse2neon.h:8551
#define SSE2NEON_BARRIER()
Definition sse2neon.h:208
FORCE_INLINE __m128 _mm_loadh_pi(__m128 a, __m64 const *p)
Definition sse2neon.h:1931
FORCE_INLINE __m128 _mm_cmpgt_ss(__m128 a, __m128 b)
Definition sse2neon.h:1306
FORCE_INLINE __m128 _mm_rcp_ss(__m128 a)
Definition sse2neon.h:2363
FORCE_INLINE __m128 _mm_castpd_ps(__m128d a)
Definition sse2neon.h:3146
FORCE_INLINE __m128 _mm_move_ss(__m128, __m128)
Definition sse2neon.h:2126
FORCE_INLINE __m64 _mm_cvttpd_pi32(__m128d a)
Definition sse2neon.h:4220
FORCE_INLINE __m128i _mm_sign_epi16(__m128i _a, __m128i _b)
Definition sse2neon.h:6632
FORCE_INLINE __m128i _mm_cmplt_epi16(__m128i a, __m128i b)
Definition sse2neon.h:3445
#define vreinterpretq_f32_m128(x)
Definition sse2neon.h:449
FORCE_INLINE __m128 _mm_round_ps(__m128, int)
Definition sse2neon.h:7629
FORCE_INLINE __m128 _mm_shuffle_ps_2200(__m128 a, __m128 b)
Definition sse2neon.h:840
FORCE_INLINE __m128i _mm_mul_epu32(__m128i a, __m128i b)
Definition sse2neon.h:4763
FORCE_INLINE __m128d _mm_cmpnge_pd(__m128d a, __m128d b)
Definition sse2neon.h:3546
FORCE_INLINE __m64 _mm_mulhi_pu16(__m64 a, __m64 b)
Definition sse2neon.h:2226
FORCE_INLINE __m128 _mm_floor_ss(__m128 a, __m128 b)
Definition sse2neon.h:7265
FORCE_INLINE __m128i _mm_set_epi64(__m64 i1, __m64 i2)
Definition sse2neon.h:4967
FORCE_INLINE void _mm_store_pd(double *mem_addr, __m128d a)
Definition sse2neon.h:5537
FORCE_INLINE __m128i _mm_mulhi_epu16(__m128i a, __m128i b)
Definition sse2neon.h:4838
FORCE_INLINE __m128 _mm_cmpneq_ps(__m128 a, __m128 b)
Definition sse2neon.h:1350
FORCE_INLINE void _mm_storer_pd(double *mem_addr, __m128d a)
Definition sse2neon.h:5624
FORCE_INLINE int _mm_cmpistra(__m128i a, __m128i b, const int imm8)
Definition sse2neon.h:8515
FORCE_INLINE __m128i _mm_shuffle_epi_1001(__m128i a)
Definition sse2neon.h:1085
FORCE_INLINE __m128d _mm_undefined_pd(void)
Definition sse2neon.h:5826
#define _MM_FROUND_TO_ZERO
Definition sse2neon.h:373
FORCE_INLINE __m128d _mm_set_pd(double, double)
Definition sse2neon.h:5010
#define SSE2NEON_GENERATE_AGGREGATE_EQUAL_ORDER(prefix)
Definition sse2neon.h:8223
FORCE_INLINE __m128d _mm_castps_pd(__m128 a)
Definition sse2neon.h:3162
FORCE_INLINE __m128 _mm_shuffle_ps_1133(__m128 a, __m128 b)
Definition sse2neon.h:858
FORCE_INLINE __m128 _mm_load1_ps(const float *p)
Definition sse2neon.h:1922
FORCE_INLINE __m128i _mm_cvtepu16_epi64(__m128i a)
Definition sse2neon.h:7039
FORCE_INLINE __m128i _mm_cvtepi16_epi64(__m128i a)
Definition sse2neon.h:6977
FORCE_INLINE __m128 _mm_hsub_ps(__m128 _a, __m128 _b)
Definition sse2neon.h:6120
FORCE_INLINE __m128 _mm_cvtpi32x2_ps(__m64 a, __m64 b)
Definition sse2neon.h:1624
#define _MM_FLUSH_ZERO_ON
Definition sse2neon.h:389
FORCE_INLINE __m128i _mm_cmpgt_epi64(__m128i a, __m128i b)
Definition sse2neon.h:8583
FORCE_INLINE void _mm_prefetch(char const *p, int i)
Definition sse2neon.h:2295
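A typical call passes the address of an upcoming cache line together with one of the _MM_HINT_* values; where no matching hint exists the prefetch degrades to a plain hint or a no-op. Illustrative fragment only (ptr is an assumed byte pointer):

    _mm_prefetch((const char *)(ptr + 64), _MM_HINT_T0);  /* pull the next line toward L1 */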
#define _MM_DENORMALS_ZERO_MASK
Definition sse2neon.h:392
FORCE_INLINE __m128 _mm_cvtpu16_ps(__m64 a)
Definition sse2neon.h:1669
FORCE_INLINE int _mm_testc_si128(__m128i a, __m128i b)
Definition sse2neon.h:7776
FORCE_INLINE __m128i _mm_castps_si128(__m128)
Definition sse2neon.h:3170
FORCE_INLINE void _mm_lfence(void)
Definition sse2neon.h:2603
FORCE_INLINE __m128 _mm_load_ps(const float *p)
Definition sse2neon.h:1894
FORCE_INLINE __m128d _mm_min_sd(__m128d a, __m128d b)
Definition sse2neon.h:4616
#define vreinterpretq_m128i_u16(x)
Definition sse2neon.h:468
FORCE_INLINE __m64 _mm_cvtps_pi16(__m128 a)
Definition sse2neon.h:1644
_mm_hint
Definition sse2neon.h:738
_MM_HINT_T1
Definition sse2neon.h:741
_MM_HINT_T0
Definition sse2neon.h:740
_MM_HINT_T2
Definition sse2neon.h:742
_MM_HINT_NTA
Definition sse2neon.h:739
#define _MM_FROUND_CUR_DIRECTION
Definition sse2neon.h:374
int64x2_t __m128i
Definition sse2neon.h:415
FORCE_INLINE __m128i _mm_avg_epu8(__m128i a, __m128i b)
Definition sse2neon.h:3127
FORCE_INLINE __m128 _mm_round_ss(__m128 a, __m128 b, int rounding)
Definition sse2neon.h:7710
#define SSE2NEON_AES_SBOX(w)
Definition sse2neon.h:8715
FORCE_INLINE __m128 _mm_rsqrt_ps(__m128 in)
Definition sse2neon.h:2372
FORCE_INLINE void _mm_store_sd(double *mem_addr, __m128d a)
Definition sse2neon.h:5566
#define vreinterpretq_u16_m128i(x)
Definition sse2neon.h:481
FORCE_INLINE __m128 _mm_cvtsd_ss(__m128 a, __m128d b)
Definition sse2neon.h:4094
FORCE_INLINE int _mm_movemask_pi8(__m64 a)
Definition sse2neon.h:2163
FORCE_INLINE __m128i _mm_shuffle_epi_2211(__m128i a)
Definition sse2neon.h:1101
FORCE_INLINE int _mm_cvtt_ss2si(__m128 a)
Definition sse2neon.h:1740
FORCE_INLINE __m128d _mm_cmpge_pd(__m128d a, __m128d b)
Definition sse2neon.h:3276
FORCE_INLINE void _mm_stream_pd(double *p, __m128d a)
Definition sse2neon.h:5660
FORCE_INLINE __m128d _mm_add_pd(__m128d a, __m128d b)
Definition sse2neon.h:2992
FORCE_INLINE void _mm_store_ps(float *p, __m128 a)
Definition sse2neon.h:2732
FORCE_INLINE int64_t _mm_cvttsd_si64(__m128d a)
Definition sse2neon.h:4250
FORCE_INLINE __m128i _mm_unpacklo_epi64(__m128i a, __m128i b)
Definition sse2neon.h:5957
FORCE_INLINE __m128d _mm_sqrt_pd(__m128d a)
Definition sse2neon.h:5367
FORCE_INLINE __m64 _mm_hadd_pi16(__m64 a, __m64 b)
Definition sse2neon.h:6338
FORCE_INLINE void _MM_SET_ROUNDING_MODE(int rounding)
Definition sse2neon.h:2478
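The rounding-mode helpers map onto the Arm FPCR here, so saving and restoring around a rounding-sensitive region is cheap. A hedged sketch:

    unsigned int saved = _MM_GET_ROUNDING_MODE();
    _MM_SET_ROUNDING_MODE(_MM_ROUND_TOWARD_ZERO);   /* e.g. truncating _mm_cvtps_epi32 */
    /* ... conversions that depend on the rounding mode ... */
    _MM_SET_ROUNDING_MODE(saved);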
FORCE_INLINE __m128d _mm_loadh_pd(__m128d a, const double *p)
Definition sse2neon.h:4382
FORCE_INLINE __m128i _mm_sub_epi8(__m128i a, __m128i b)
Definition sse2neon.h:5732
FORCE_INLINE __m128i _mm_sll_epi64(__m128i a, __m128i count)
Definition sse2neon.h:5309
#define SSE2NEON_CMPISTRX_LENGTH(str, len, imm8)
Definition sse2neon.h:8485
FORCE_INLINE __m128d _mm_max_sd(__m128d a, __m128d b)
Definition sse2neon.h:4550
FORCE_INLINE __m128 _mm_cvt_si2ss(__m128 a, int b)
Definition sse2neon.h:1576
#define vreinterpret_s8_m64(x)
Definition sse2neon.h:504
FORCE_INLINE uint64_t _sse2neon_get_fpcr(void)
Definition sse2neon.h:1817
#define _mm_srli_si128(a, imm)
Definition sse2neon.h:5525
#define _mm_shuffle_epi32(a, imm)
Definition sse2neon.h:5183
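_mm_shuffle_epi32 takes an immediate built from four 2-bit lane selectors; with the usual _MM_SHUFFLE helper (assumed available, as in the x86 headers) a lane-0 broadcast looks like:

    __m128i v     = _mm_set_epi32(4, 3, 2, 1);                     /* lanes 0..3 = 1,2,3,4 */
    __m128i splat = _mm_shuffle_epi32(v, _MM_SHUFFLE(0, 0, 0, 0)); /* all lanes = lane 0 */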
FORCE_INLINE __m128 _mm_sqrt_ps(__m128 in)
Definition sse2neon.h:2687
FORCE_INLINE __m128i _mm_sll_epi16(__m128i a, __m128i count)
Definition sse2neon.h:5283
#define vreinterpret_u8_m64(x)
Definition sse2neon.h:499
FORCE_INLINE void _mm_storeh_pi(__m64 *p, __m128 a)
Definition sse2neon.h:2764
FORCE_INLINE __m128i _mm_unpacklo_epi16(__m128i a, __m128i b)
Definition sse2neon.h:5925
FORCE_INLINE int _mm_comigt_ss(__m128 a, __m128 b)
Definition sse2neon.h:1510
FORCE_INLINE __m64 _mm_sign_pi16(__m64 _a, __m64 _b)
Definition sse2neon.h:6718
FORCE_INLINE __m128 _mm_cmpge_ss(__m128 a, __m128 b)
Definition sse2neon.h:1288
FORCE_INLINE __m128 _mm_min_ps(__m128 a, __m128 b)
Definition sse2neon.h:2088
FORCE_INLINE __m128d _mm_movedup_pd(__m128d a)
Definition sse2neon.h:6147
FORCE_INLINE void _mm_pause(void)
Definition sse2neon.h:4922
FORCE_INLINE __m128i _mm_loadu_si64(const void *p)
Definition sse2neon.h:1978
FORCE_INLINE __m128d _mm_xor_pd(__m128d a, __m128d b)
Definition sse2neon.h:6003
FORCE_INLINE __m128i _mm_stream_load_si128(__m128i *p)
Definition sse2neon.h:7719
FORCE_INLINE void _mm_store_pd1(double *mem_addr, __m128d a)
Definition sse2neon.h:5550
FORCE_INLINE __m128 _mm_cvtepi32_ps(__m128i a)
Definition sse2neon.h:3894
FORCE_INLINE __m128 _mm_or_ps(__m128, __m128)
Definition sse2neon.h:2235
FORCE_INLINE __m128d _mm_blendv_pd(__m128d _a, __m128d _b, __m128d _mask)
Definition sse2neon.h:6869
FORCE_INLINE __m128d _mm_unpacklo_pd(__m128d a, __m128d b)
Definition sse2neon.h:5988
FORCE_INLINE int _sse2neon_ctz(unsigned int x)
Definition sse2neon.h:8302
FORCE_INLINE __m128 _mm_min_ss(__m128 a, __m128 b)
Definition sse2neon.h:2115
FORCE_INLINE __m128d _mm_cmple_sd(__m128d a, __m128d b)
Definition sse2neon.h:3423
FORCE_INLINE __m128d _mm_cmpngt_sd(__m128d a, __m128d b)
Definition sse2neon.h:3608
#define SSE2NEON_AES_H0(x)
Definition sse2neon.h:8798
FORCE_INLINE __m64 _mm_abs_pi32(__m64 a)
Definition sse2neon.h:6233
FORCE_INLINE __m128i _mm_srl_epi64(__m128i a, __m128i count)
Definition sse2neon.h:5475
FORCE_INLINE int _mm_cmpestro(__m128i a, int la, __m128i b, int lb, const int imm8)
Definition sse2neon.h:8443
FORCE_INLINE __m128 _mm_max_ps(__m128 a, __m128 b)
Definition sse2neon.h:2040
FORCE_INLINE __m64 _mm_hadd_pi32(__m64 a, __m64 b)
Definition sse2neon.h:6347
FORCE_INLINE void _mm_mfence(void)
Definition sse2neon.h:2593
FORCE_INLINE float _mm_cvtss_f32(__m128 a)
Definition sse2neon.h:1703
FORCE_INLINE __m128 _mm_cmpord_ps(__m128 a, __m128 b)
Definition sse2neon.h:1445
#define _MM_FROUND_NO_EXC
Definition sse2neon.h:375
#define vreinterpretq_s64_m128i(x)
Definition sse2neon.h:478
FORCE_INLINE int _mm_cvtsi128_si32(__m128i a)
Definition sse2neon.h:4110
#define _MM_FLUSH_ZERO_OFF
Definition sse2neon.h:390
FORCE_INLINE __m64 _mm_maddubs_pi16(__m64 _a, __m64 _b)
Definition sse2neon.h:6528
SIMDVec
Definition sse2neon.h:577
FORCE_INLINE void _mm_clflush(void const *p)
Definition sse2neon.h:3201
fpcr_bitfield
Definition sse2neon.h:746
uint16_t res0
Definition sse2neon.h:747
uint8_t bit22
Definition sse2neon.h:749
uint8_t res1
Definition sse2neon.h:748
uint8_t bit23
Definition sse2neon.h:750
uint8_t res2
Definition sse2neon.h:752
uint8_t bit24
Definition sse2neon.h:751