sse2neon.h
1#ifndef SSE2NEON_H
2#define SSE2NEON_H
3
4/*
5 * sse2neon is freely redistributable under the MIT License.
6 *
7 * Copyright (c) 2015-2024 SSE2NEON Contributors.
8 *
9 * Permission is hereby granted, free of charge, to any person obtaining a copy
10 * of this software and associated documentation files (the "Software"), to deal
11 * in the Software without restriction, including without limitation the rights
12 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
13 * copies of the Software, and to permit persons to whom the Software is
14 * furnished to do so, subject to the following conditions:
15 *
16 * The above copyright notice and this permission notice shall be included in
17 * all copies or substantial portions of the Software.
18 *
19 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
20 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
21 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
22 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
23 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
24 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
25 * SOFTWARE.
26 */
27
28// This header file provides a simple API translation layer
29// between SSE intrinsics and their corresponding Arm/AArch64 NEON versions
30//
31// Contributors to this work are:
32// John W. Ratcliff <jratcliffscarab@gmail.com>
33// Brandon Rowlett <browlett@nvidia.com>
34// Ken Fast <kfast@gdeb.com>
35// Eric van Beurden <evanbeurden@nvidia.com>
36// Alexander Potylitsin <apotylitsin@nvidia.com>
37// Hasindu Gamaarachchi <hasindu2008@gmail.com>
38// Jim Huang <jserv@ccns.ncku.edu.tw>
39// Mark Cheng <marktwtn@gmail.com>
40// Malcolm James MacLeod <malcolm@gulden.com>
41// Devin Hussey (easyaspi314) <husseydevin@gmail.com>
42// Sebastian Pop <spop@amazon.com>
43// Developer Ecosystem Engineering <DeveloperEcosystemEngineering@apple.com>
44// Danila Kutenin <danilak@google.com>
45// François Turban (JishinMaster) <francois.turban@gmail.com>
46// Pei-Hsuan Hung <afcidk@gmail.com>
47// Yang-Hao Yuan <yuanyanghau@gmail.com>
48// Syoyo Fujita <syoyo@lighttransport.com>
49// Brecht Van Lommel <brecht@blender.org>
50// Jonathan Hue <jhue@adobe.com>
51// Cuda Chen <clh960524@gmail.com>
52// Aymen Qader <aymen.qader@arm.com>
53// Anthony Roberts <anthony.roberts@linaro.org>
54// Sean Luchen <seanluchen@google.com>
55
56/* Tunable configurations */
57
58/* Enable precise implementations of math operations.
59 * This slows the computation down slightly, but gives results consistent with
60 * x86 SSE (e.g. it can eliminate a hole or NaN pixel in a rendering result).
61 */
62/* _mm_min|max_ps|ss|pd|sd */
63#ifndef SSE2NEON_PRECISE_MINMAX
64#define SSE2NEON_PRECISE_MINMAX (0)
65#endif
66/* _mm_rcp_ps */
67#ifndef SSE2NEON_PRECISE_DIV
68#define SSE2NEON_PRECISE_DIV (0)
69#endif
70/* _mm_sqrt_ps and _mm_rsqrt_ps */
71#ifndef SSE2NEON_PRECISE_SQRT
72#define SSE2NEON_PRECISE_SQRT (0)
73#endif
74/* _mm_dp_pd */
75#ifndef SSE2NEON_PRECISE_DP
76#define SSE2NEON_PRECISE_DP (0)
77#endif
78
79/* Enable inclusion of windows.h on MSVC platforms
80 * This makes _mm_clflush functional on Windows, as there is no builtin.
81 */
82#ifndef SSE2NEON_INCLUDE_WINDOWS_H
83#define SSE2NEON_INCLUDE_WINDOWS_H (0)
84#endif
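/* Usage sketch (illustrative): all of the tunables above are guarded by
 * #ifndef, so they are meant to be defined before this header is included,
 * either in a wrapper header or on the compiler command line:
 *
 *     #define SSE2NEON_PRECISE_MINMAX 1
 *     #define SSE2NEON_PRECISE_DIV 1
 *     #include "sse2neon.h"
 *
 * or equivalently: cc -DSSE2NEON_PRECISE_MINMAX=1 -DSSE2NEON_PRECISE_DIV=1 ...
 */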
85
86/* compiler specific definitions */
87#if defined(__GNUC__) || defined(__clang__)
88#pragma push_macro("FORCE_INLINE")
89#pragma push_macro("ALIGN_STRUCT")
90#define FORCE_INLINE static inline __attribute__((always_inline))
91#define ALIGN_STRUCT(x) __attribute__((aligned(x)))
92#define _sse2neon_likely(x) __builtin_expect(!!(x), 1)
93#define _sse2neon_unlikely(x) __builtin_expect(!!(x), 0)
94#elif defined(_MSC_VER)
95#if _MSVC_TRADITIONAL
96#error Using the traditional MSVC preprocessor is not supported! Use /Zc:preprocessor instead.
97#endif
98#ifndef FORCE_INLINE
99#define FORCE_INLINE static inline
100#endif
101#ifndef ALIGN_STRUCT
102#define ALIGN_STRUCT(x) __declspec(align(x))
103#endif
104#define _sse2neon_likely(x) (x)
105#define _sse2neon_unlikely(x) (x)
106#else
107#pragma message("Macro name collisions may happen with unsupported compilers.")
108#endif
109
110#if !defined(__clang__) && defined(__GNUC__) && __GNUC__ < 10
111#warning "GCC versions earlier than 10 are not supported."
112#endif
113
114#if defined(__OPTIMIZE__) && !defined(SSE2NEON_SUPPRESS_WARNINGS)
115#warning \
116 "Report any potential compiler optimization issues when using SSE2NEON. See the 'Optimization' section at https://github.com/DLTcollab/sse2neon."
117#endif
118
120/* C does not allow initializing a static variable with a function call, so only make these constants 'static' in C++. */
120#ifdef __cplusplus
121#define _sse2neon_const static const
122#else
123#define _sse2neon_const const
124#endif
125
126#include <fenv.h>
127#include <stdint.h>
128#include <stdlib.h>
129#include <string.h>
130
131FORCE_INLINE double sse2neon_recast_u64_f64(uint64_t val)
132{
133 double tmp;
134 memcpy(&tmp, &val, sizeof(uint64_t));
135 return tmp;
136}
137FORCE_INLINE int64_t sse2neon_recast_f64_s64(double val)
138{
139 int64_t tmp;
140 memcpy(&tmp, &val, sizeof(uint64_t));
141 return tmp;
142}
143
144#if defined(_WIN32) && !defined(__MINGW32__)
145/* Definitions for _mm_{malloc,free} are provided by <malloc.h> from MSVC. */
146#define SSE2NEON_ALLOC_DEFINED
147#endif
148
149/* If using MSVC */
150#ifdef _MSC_VER
151#if defined(_M_ARM64EC)
152#define _DISABLE_SOFTINTRIN_ 1
153#endif
154#include <intrin.h>
155#if SSE2NEON_INCLUDE_WINDOWS_H
156#include <processthreadsapi.h>
157#include <windows.h>
158#endif
159
160#if !defined(__cplusplus)
161#error SSE2NEON only supports C++ compilation with this compiler
162#endif
163
164#ifdef SSE2NEON_ALLOC_DEFINED
165#include <malloc.h>
166#endif
167
168#if (defined(_M_AMD64) || defined(__x86_64__)) || \
169 (defined(_M_ARM64) || defined(_M_ARM64EC) || defined(__arm64__))
170#define SSE2NEON_HAS_BITSCAN64
171#endif
172#endif
173
174#if defined(__GNUC__) || defined(__clang__)
175#define _sse2neon_define0(type, s, body) \
176 __extension__({ \
177 type _a = (s); \
178 body \
179 })
180#define _sse2neon_define1(type, s, body) \
181 __extension__({ \
182 type _a = (s); \
183 body \
184 })
185#define _sse2neon_define2(type, a, b, body) \
186 __extension__({ \
187 type _a = (a), _b = (b); \
188 body \
189 })
190#define _sse2neon_return(ret) (ret)
191#else
192#define _sse2neon_define0(type, a, body) [=](type _a) { body }(a)
193#define _sse2neon_define1(type, a, body) [](type _a) { body }(a)
194#define _sse2neon_define2(type, a, b, body) \
195 [](type _a, type _b) { body }((a), (b))
196#define _sse2neon_return(ret) return ret
197#endif
198
199#define _sse2neon_init(...) \
200 { \
201 __VA_ARGS__ \
202 }
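/* Illustrative sketch (not part of the upstream API): the _sse2neon_define*
 * helpers let the same intrinsic body compile as a GNU statement expression on
 * GCC/Clang and as an immediately-invoked lambda on MSVC. A hypothetical
 * helper built with them could look like:
 *
 *     #define my_add_epi32(a, b)                                          \
 *         _sse2neon_define2(__m128i, a, b,                                 \
 *             int32x4_t sum = vaddq_s32(vreinterpretq_s32_m128i(_a),       \
 *                                       vreinterpretq_s32_m128i(_b));      \
 *             _sse2neon_return(vreinterpretq_m128i_s32(sum));)
 *
 * where _a and _b are the names bound internally by the macros above.
 */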
203
204/* Compiler barrier */
205#if defined(_MSC_VER) && !defined(__clang__)
206#define SSE2NEON_BARRIER() _ReadWriteBarrier()
207#else
208#define SSE2NEON_BARRIER() \
209 do { \
210 __asm__ __volatile__("" ::: "memory"); \
211 (void) 0; \
212 } while (0)
213#endif
214
215/* Memory barriers
216 * __atomic_thread_fence does not include a compiler barrier; instead,
217 * the barrier is part of __atomic_load/__atomic_store's "volatile-like"
218 * semantics.
219 */
220#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L)
221#include <stdatomic.h>
222#endif
223
224FORCE_INLINE void _sse2neon_smp_mb(void)
225{
227#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L) && \
228 !defined(__STDC_NO_ATOMICS__)
229 atomic_thread_fence(memory_order_seq_cst);
230#elif defined(__GNUC__) || defined(__clang__)
231 __atomic_thread_fence(__ATOMIC_SEQ_CST);
232#else /* MSVC */
233 __dmb(_ARM64_BARRIER_ISH);
234#endif
235}
236
237/* Architecture-specific build options */
238/* FIXME: #pragma GCC push_options is only available on GCC */
239#if defined(__GNUC__)
240#if defined(__arm__) && __ARM_ARCH == 7
241/* According to the ARM C Language Extensions (ACLE) specification,
242 * __ARM_NEON is defined to a value indicating the level of Advanced SIMD
243 * (NEON) architecture support.
244 */
245#if !defined(__ARM_NEON) || !defined(__ARM_NEON__)
246#error "You must enable NEON instructions (e.g. -mfpu=neon) to use SSE2NEON."
247#endif
248#if !defined(__clang__)
249#pragma GCC push_options
250#pragma GCC target("fpu=neon")
251#endif
252#elif defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
253#if !defined(__clang__) && !defined(_MSC_VER)
254#pragma GCC push_options
255#pragma GCC target("+simd")
256#endif
257#elif __ARM_ARCH == 8
258#if !defined(__ARM_NEON) || !defined(__ARM_NEON__)
259#error \
260 "You must enable NEON instructions (e.g. -mfpu=neon-fp-armv8) to use SSE2NEON."
261#endif
262#if !defined(__clang__) && !defined(_MSC_VER)
263#pragma GCC push_options
264#endif
265#else
266#error \
267 "Unsupported target. Must be either ARMv7-A+NEON or ARMv8-A \
268(you could try setting target explicitly with -march or -mcpu)"
269#endif
270#endif
271
272#include <arm_neon.h>
273#if (!defined(__aarch64__) && !defined(_M_ARM64) && !defined(_M_ARM64EC)) && \
274 (__ARM_ARCH == 8)
275#if defined __has_include && __has_include(<arm_acle.h>)
276#include <arm_acle.h>
277#endif
278#endif
279
280/* Apple Silicon cache lines are twice the size commonly used by Intel, AMD,
281 * and other Arm microarchitectures.
282 * From sysctl -a on Apple M1:
283 * hw.cachelinesize: 128
284 */
285#if defined(__APPLE__) && (defined(__aarch64__) || defined(__arm64__))
286#define SSE2NEON_CACHELINE_SIZE 128
287#else
288#define SSE2NEON_CACHELINE_SIZE 64
289#endif
290
291/* Rounding functions require either AArch64 instructions or a libm fallback */
292#if !defined(__aarch64__) && !defined(_M_ARM64) && !defined(_M_ARM64EC)
293#include <math.h>
294#endif
295
296/* On ARMv7, some registers, such as PMUSERENR and PMCCNTR, are read-only
297 * or not accessible at all in user mode.
298 * To write to or read these registers from user mode,
299 * a system call has to be performed instead.
300 */
301#if !defined(__aarch64__) && !defined(_M_ARM64) && !defined(_M_ARM64EC)
302#include <sys/time.h>
303#endif
304
305/* "__has_builtin" can be used to query support for built-in functions
306 * provided by gcc/clang and other compilers that support it.
307 */
308#ifndef __has_builtin /* GCC prior to 10 or non-clang compilers */
309/* Compatibility with gcc <= 9 */
310#if defined(__GNUC__) && (__GNUC__ <= 9)
311#define __has_builtin(x) HAS##x
312#define HAS__builtin_popcount 1
313#define HAS__builtin_popcountll 1
314
315// __builtin_shuffle introduced in GCC 4.7.0
316#if (__GNUC__ >= 5) || ((__GNUC__ == 4) && (__GNUC_MINOR__ >= 7))
317#define HAS__builtin_shuffle 1
318#else
319#define HAS__builtin_shuffle 0
320#endif
321
322#define HAS__builtin_shufflevector 0
323#define HAS__builtin_nontemporal_store 0
324#else
325#define __has_builtin(x) 0
326#endif
327#endif
328
337#define _MM_SHUFFLE(fp3, fp2, fp1, fp0) \
338 (((fp3) << 6) | ((fp2) << 4) | ((fp1) << 2) | ((fp0)))
339
347#define _MM_SHUFFLE2(fp1, fp0) (((fp1) << 1) | (fp0))
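/* Worked example (illustrative): _MM_SHUFFLE packs four 2-bit lane selectors,
 * most significant selector first.
 *     _MM_SHUFFLE(3, 2, 1, 0) == (3 << 6) | (2 << 4) | (1 << 2) | 0 == 0xE4
 *         // identity: dst = { a[0], a[1], a[2], a[3] }
 *     _MM_SHUFFLE(0, 1, 2, 3) == 0x1B   // reverses the four lanes
 */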
348
349#if __has_builtin(__builtin_shufflevector)
350#define _sse2neon_shuffle(type, a, b, ...) \
351 __builtin_shufflevector(a, b, __VA_ARGS__)
352#elif __has_builtin(__builtin_shuffle)
353#define _sse2neon_shuffle(type, a, b, ...) \
354 __extension__({ \
355 type tmp = {__VA_ARGS__}; \
356 __builtin_shuffle(a, b, tmp); \
357 })
358#endif
359
360#ifdef _sse2neon_shuffle
361#define vshuffle_s16(a, b, ...) _sse2neon_shuffle(int16x4_t, a, b, __VA_ARGS__)
362#define vshuffleq_s16(a, b, ...) _sse2neon_shuffle(int16x8_t, a, b, __VA_ARGS__)
363#define vshuffle_s32(a, b, ...) _sse2neon_shuffle(int32x2_t, a, b, __VA_ARGS__)
364#define vshuffleq_s32(a, b, ...) _sse2neon_shuffle(int32x4_t, a, b, __VA_ARGS__)
365#define vshuffle_s64(a, b, ...) _sse2neon_shuffle(int64x1_t, a, b, __VA_ARGS__)
366#define vshuffleq_s64(a, b, ...) _sse2neon_shuffle(int64x2_t, a, b, __VA_ARGS__)
367#endif
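/* Illustrative example (assuming __builtin_shufflevector or __builtin_shuffle
 * is available so that _sse2neon_shuffle is defined): lanes are selected by
 * index from the concatenation of the two inputs.
 *
 *     int32x4_t lo_pairs = vshuffleq_s32(a, b, 0, 1, 4, 5);
 *     // lo_pairs = { a[0], a[1], b[0], b[1] }
 */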
368
369/* Rounding mode macros. */
370#define _MM_FROUND_TO_NEAREST_INT 0x00
371#define _MM_FROUND_TO_NEG_INF 0x01
372#define _MM_FROUND_TO_POS_INF 0x02
373#define _MM_FROUND_TO_ZERO 0x03
374#define _MM_FROUND_CUR_DIRECTION 0x04
375#define _MM_FROUND_NO_EXC 0x08
376#define _MM_FROUND_RAISE_EXC 0x00
377#define _MM_FROUND_NINT (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_RAISE_EXC)
378#define _MM_FROUND_FLOOR (_MM_FROUND_TO_NEG_INF | _MM_FROUND_RAISE_EXC)
379#define _MM_FROUND_CEIL (_MM_FROUND_TO_POS_INF | _MM_FROUND_RAISE_EXC)
380#define _MM_FROUND_TRUNC (_MM_FROUND_TO_ZERO | _MM_FROUND_RAISE_EXC)
381#define _MM_FROUND_RINT (_MM_FROUND_CUR_DIRECTION | _MM_FROUND_RAISE_EXC)
382#define _MM_FROUND_NEARBYINT (_MM_FROUND_CUR_DIRECTION | _MM_FROUND_NO_EXC)
383#define _MM_ROUND_NEAREST 0x0000
384#define _MM_ROUND_DOWN 0x2000
385#define _MM_ROUND_UP 0x4000
386#define _MM_ROUND_TOWARD_ZERO 0x6000
387/* Flush zero mode macros. */
388#define _MM_FLUSH_ZERO_MASK 0x8000
389#define _MM_FLUSH_ZERO_ON 0x8000
390#define _MM_FLUSH_ZERO_OFF 0x0000
391/* Denormals are zeros mode macros. */
392#define _MM_DENORMALS_ZERO_MASK 0x0040
393#define _MM_DENORMALS_ZERO_ON 0x0040
394#define _MM_DENORMALS_ZERO_OFF 0x0000
395
396/* indicate immediate constant argument in a given range */
397#define __constrange(a, b) const
398
399/* A few intrinsics accept traditional data types like ints or floats, but
400 * most operate on data types that are specific to SSE.
401 * If a vector type ends in d, it contains doubles, and if it does not have
402 * a suffix, it contains floats. An integer vector type can contain any type
403 * of integer, from chars to shorts to unsigned long longs.
404 */
405typedef int64x1_t __m64;
406typedef float32x4_t __m128; /* 128-bit vector containing 4 floats */
407// On the 32-bit Arm architecture, float64x2_t is not supported.
408// The __m128d data type therefore has to be represented differently for the
409// related intrinsic conversions.
410#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
411typedef float64x2_t __m128d; /* 128-bit vector containing 2 doubles */
412#else
413typedef float32x4_t __m128d;
414#endif
415typedef int64x2_t __m128i; /* 128-bit vector containing integers */
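/* Illustrative usage of the vector types above (identical to x86 code):
 *
 *     __m128  f = _mm_set_ps(4.0f, 3.0f, 2.0f, 1.0f); // four floats
 *     __m128i i = _mm_set1_epi32(7);                  // four int32, all 7
 *     __m128d d = _mm_set_pd(2.0, 1.0);               // two doubles
 *
 * Since they are plain typedefs over NEON vector types, values can also be
 * handed to NEON intrinsics after an explicit vreinterpretq_* conversion.
 */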
416
417// Some intrinsics operate on unaligned data types.
418typedef int16_t ALIGN_STRUCT(1) unaligned_int16_t;
419typedef int32_t ALIGN_STRUCT(1) unaligned_int32_t;
420typedef int64_t ALIGN_STRUCT(1) unaligned_int64_t;
421
422// __int64 is defined in the Intel Intrinsics Guide and maps to a different
423// data type under different data models.
424#if !(defined(_WIN32) || defined(_WIN64) || defined(__int64))
425#if (defined(__x86_64__) || defined(__i386__))
426#define __int64 long long
427#else
428#define __int64 int64_t
429#endif
430#endif
431
432/* type-safe casting between types */
433
434#define vreinterpretq_m128_f16(x) vreinterpretq_f32_f16(x)
435#define vreinterpretq_m128_f32(x) (x)
436#define vreinterpretq_m128_f64(x) vreinterpretq_f32_f64(x)
437
438#define vreinterpretq_m128_u8(x) vreinterpretq_f32_u8(x)
439#define vreinterpretq_m128_u16(x) vreinterpretq_f32_u16(x)
440#define vreinterpretq_m128_u32(x) vreinterpretq_f32_u32(x)
441#define vreinterpretq_m128_u64(x) vreinterpretq_f32_u64(x)
442
443#define vreinterpretq_m128_s8(x) vreinterpretq_f32_s8(x)
444#define vreinterpretq_m128_s16(x) vreinterpretq_f32_s16(x)
445#define vreinterpretq_m128_s32(x) vreinterpretq_f32_s32(x)
446#define vreinterpretq_m128_s64(x) vreinterpretq_f32_s64(x)
447
448#define vreinterpretq_f16_m128(x) vreinterpretq_f16_f32(x)
449#define vreinterpretq_f32_m128(x) (x)
450#define vreinterpretq_f64_m128(x) vreinterpretq_f64_f32(x)
451
452#define vreinterpretq_u8_m128(x) vreinterpretq_u8_f32(x)
453#define vreinterpretq_u16_m128(x) vreinterpretq_u16_f32(x)
454#define vreinterpretq_u32_m128(x) vreinterpretq_u32_f32(x)
455#define vreinterpretq_u64_m128(x) vreinterpretq_u64_f32(x)
456
457#define vreinterpretq_s8_m128(x) vreinterpretq_s8_f32(x)
458#define vreinterpretq_s16_m128(x) vreinterpretq_s16_f32(x)
459#define vreinterpretq_s32_m128(x) vreinterpretq_s32_f32(x)
460#define vreinterpretq_s64_m128(x) vreinterpretq_s64_f32(x)
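/* Illustrative: the wrappers above keep the public __m128 type while the
 * underlying NEON operation runs on a reinterpreted element type. A bitwise OR
 * of float vectors, for example, follows this pattern (the same pattern used
 * throughout this file):
 *
 *     vreinterpretq_m128_s32(vorrq_s32(vreinterpretq_s32_m128(a),
 *                                      vreinterpretq_s32_m128(b)));
 */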
461
462#define vreinterpretq_m128i_s8(x) vreinterpretq_s64_s8(x)
463#define vreinterpretq_m128i_s16(x) vreinterpretq_s64_s16(x)
464#define vreinterpretq_m128i_s32(x) vreinterpretq_s64_s32(x)
465#define vreinterpretq_m128i_s64(x) (x)
466
467#define vreinterpretq_m128i_u8(x) vreinterpretq_s64_u8(x)
468#define vreinterpretq_m128i_u16(x) vreinterpretq_s64_u16(x)
469#define vreinterpretq_m128i_u32(x) vreinterpretq_s64_u32(x)
470#define vreinterpretq_m128i_u64(x) vreinterpretq_s64_u64(x)
471
472#define vreinterpretq_f32_m128i(x) vreinterpretq_f32_s64(x)
473#define vreinterpretq_f64_m128i(x) vreinterpretq_f64_s64(x)
474
475#define vreinterpretq_s8_m128i(x) vreinterpretq_s8_s64(x)
476#define vreinterpretq_s16_m128i(x) vreinterpretq_s16_s64(x)
477#define vreinterpretq_s32_m128i(x) vreinterpretq_s32_s64(x)
478#define vreinterpretq_s64_m128i(x) (x)
479
480#define vreinterpretq_u8_m128i(x) vreinterpretq_u8_s64(x)
481#define vreinterpretq_u16_m128i(x) vreinterpretq_u16_s64(x)
482#define vreinterpretq_u32_m128i(x) vreinterpretq_u32_s64(x)
483#define vreinterpretq_u64_m128i(x) vreinterpretq_u64_s64(x)
484
485#define vreinterpret_m64_s8(x) vreinterpret_s64_s8(x)
486#define vreinterpret_m64_s16(x) vreinterpret_s64_s16(x)
487#define vreinterpret_m64_s32(x) vreinterpret_s64_s32(x)
488#define vreinterpret_m64_s64(x) (x)
489
490#define vreinterpret_m64_u8(x) vreinterpret_s64_u8(x)
491#define vreinterpret_m64_u16(x) vreinterpret_s64_u16(x)
492#define vreinterpret_m64_u32(x) vreinterpret_s64_u32(x)
493#define vreinterpret_m64_u64(x) vreinterpret_s64_u64(x)
494
495#define vreinterpret_m64_f16(x) vreinterpret_s64_f16(x)
496#define vreinterpret_m64_f32(x) vreinterpret_s64_f32(x)
497#define vreinterpret_m64_f64(x) vreinterpret_s64_f64(x)
498
499#define vreinterpret_u8_m64(x) vreinterpret_u8_s64(x)
500#define vreinterpret_u16_m64(x) vreinterpret_u16_s64(x)
501#define vreinterpret_u32_m64(x) vreinterpret_u32_s64(x)
502#define vreinterpret_u64_m64(x) vreinterpret_u64_s64(x)
503
504#define vreinterpret_s8_m64(x) vreinterpret_s8_s64(x)
505#define vreinterpret_s16_m64(x) vreinterpret_s16_s64(x)
506#define vreinterpret_s32_m64(x) vreinterpret_s32_s64(x)
507#define vreinterpret_s64_m64(x) (x)
508
509#define vreinterpret_f32_m64(x) vreinterpret_f32_s64(x)
510
511#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
512#define vreinterpretq_m128d_s32(x) vreinterpretq_f64_s32(x)
513#define vreinterpretq_m128d_s64(x) vreinterpretq_f64_s64(x)
514
515#define vreinterpretq_m128d_u64(x) vreinterpretq_f64_u64(x)
516
517#define vreinterpretq_m128d_f32(x) vreinterpretq_f64_f32(x)
518#define vreinterpretq_m128d_f64(x) (x)
519
520#define vreinterpretq_s64_m128d(x) vreinterpretq_s64_f64(x)
521
522#define vreinterpretq_u32_m128d(x) vreinterpretq_u32_f64(x)
523#define vreinterpretq_u64_m128d(x) vreinterpretq_u64_f64(x)
524
525#define vreinterpretq_f64_m128d(x) (x)
526#define vreinterpretq_f32_m128d(x) vreinterpretq_f32_f64(x)
527#else
528#define vreinterpretq_m128d_s32(x) vreinterpretq_f32_s32(x)
529#define vreinterpretq_m128d_s64(x) vreinterpretq_f32_s64(x)
530
531#define vreinterpretq_m128d_u32(x) vreinterpretq_f32_u32(x)
532#define vreinterpretq_m128d_u64(x) vreinterpretq_f32_u64(x)
533
534#define vreinterpretq_m128d_f32(x) (x)
535
536#define vreinterpretq_s64_m128d(x) vreinterpretq_s64_f32(x)
537
538#define vreinterpretq_u32_m128d(x) vreinterpretq_u32_f32(x)
539#define vreinterpretq_u64_m128d(x) vreinterpretq_u64_f32(x)
540
541#define vreinterpretq_f32_m128d(x) (x)
542#endif
543
544// This header defines a union called 'SIMDVec' which can be used by
545// applications that attempt to access the contents of an __m128 value
546// directly. Note that accessing the __m128 members directly is considered
547// bad coding practice by Microsoft: @see:
548// https://learn.microsoft.com/en-us/cpp/cpp/m128
549//
550// However, some legacy source code may try to access the contents of an __m128
551// value directly, and the developer can use SIMDVec as an alias for it. Any
552// casting must be done manually by the developer, as you cannot cast or
553// otherwise alias the base NEON data type for intrinsic operations.
554//
555// The union is intended to allow direct access to an __m128 variable using the
556// member names that the MSVC compiler provides. It should really only be used
557// when accessing the members of the vector as integer values. GCC/Clang
558// allow native access to the float members through a simple array access
559// operator (in C since 4.6, in C++ since 4.8).
560//
561// Ideally, direct access to SIMD vectors should be avoided since it can cause
562// a performance hit. If it really is needed, however, the original __m128
563// variable can be aliased with a pointer to this union and used to access
564// individual components. Such use should be hidden behind a macro that is
565// used throughout the codebase to access the members, instead of declaring
566// this type of variable everywhere.
567typedef union ALIGN_STRUCT(16) SIMDVec {
568 float m128_f32[4]; // as floats - DON'T USE. Added for convenience.
569 int8_t m128_i8[16]; // as signed 8-bit integers.
570 int16_t m128_i16[8]; // as signed 16-bit integers.
571 int32_t m128_i32[4]; // as signed 32-bit integers.
572 int64_t m128_i64[2]; // as signed 64-bit integers.
573 uint8_t m128_u8[16]; // as unsigned 8-bit integers.
574 uint16_t m128_u16[8]; // as unsigned 16-bit integers.
575 uint32_t m128_u32[4]; // as unsigned 32-bit integers.
576 uint64_t m128_u64[2]; // as unsigned 64-bit integers.
577} SIMDVec;
578
579// casting using SIMDVec
580#define vreinterpretq_nth_u64_m128i(x, n) (((SIMDVec *) &x)->m128_u64[n])
581#define vreinterpretq_nth_u32_m128i(x, n) (((SIMDVec *) &x)->m128_u32[n])
582#define vreinterpretq_nth_u8_m128i(x, n) (((SIMDVec *) &x)->m128_u8[n])
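/* Illustrative (for legacy code only; prefer the extract/insert intrinsics):
 *
 *     __m128i v = _mm_set_epi32(3, 2, 1, 0);
 *     uint32_t x = vreinterpretq_nth_u32_m128i(v, 2); // x == 2
 */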
583
584/* SSE macros */
585#define _MM_GET_FLUSH_ZERO_MODE _sse2neon_mm_get_flush_zero_mode
586#define _MM_SET_FLUSH_ZERO_MODE _sse2neon_mm_set_flush_zero_mode
587#define _MM_GET_DENORMALS_ZERO_MODE _sse2neon_mm_get_denormals_zero_mode
588#define _MM_SET_DENORMALS_ZERO_MODE _sse2neon_mm_set_denormals_zero_mode
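/* Illustrative usage (the flush-zero/denormal macros above map to the
 * _sse2neon_mm_* implementations further down in this file, and the
 * rounding-mode helpers are likewise provided later):
 *
 *     _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON);   // denormal results -> 0
 *     _MM_SET_ROUNDING_MODE(_MM_ROUND_TOWARD_ZERO); // truncate on conversion
 *     unsigned int mode = _MM_GET_ROUNDING_MODE();
 */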
589
590// Function declaration
591// SSE
592FORCE_INLINE unsigned int _MM_GET_ROUNDING_MODE(void);
593FORCE_INLINE __m128 _mm_move_ss(__m128, __m128);
594FORCE_INLINE __m128 _mm_or_ps(__m128, __m128);
595FORCE_INLINE __m128 _mm_set_ps1(float);
596FORCE_INLINE __m128 _mm_setzero_ps(void);
597// SSE2
598FORCE_INLINE __m128i _mm_and_si128(__m128i, __m128i);
599FORCE_INLINE __m128i _mm_castps_si128(__m128);
601FORCE_INLINE __m128i _mm_cvtps_epi32(__m128);
602FORCE_INLINE __m128d _mm_move_sd(__m128d, __m128d);
603FORCE_INLINE __m128i _mm_or_si128(__m128i, __m128i);
604FORCE_INLINE __m128i _mm_set_epi32(int, int, int, int);
605FORCE_INLINE __m128i _mm_set_epi64x(int64_t, int64_t);
606FORCE_INLINE __m128d _mm_set_pd(double, double);
607FORCE_INLINE __m128i _mm_set1_epi32(int);
608FORCE_INLINE __m128i _mm_setzero_si128(void);
609// SSE4.1
610FORCE_INLINE __m128d _mm_ceil_pd(__m128d);
611FORCE_INLINE __m128 _mm_ceil_ps(__m128);
612FORCE_INLINE __m128d _mm_floor_pd(__m128d);
613FORCE_INLINE __m128 _mm_floor_ps(__m128);
614FORCE_INLINE __m128d _mm_round_pd(__m128d, int);
615FORCE_INLINE __m128 _mm_round_ps(__m128, int);
616// SSE4.2
617FORCE_INLINE uint32_t _mm_crc32_u8(uint32_t, uint8_t);
618
619/* Backwards compatibility for compilers lacking support for specific types */
620
621// Older GCC versions do not provide the vld1q_u8_x4 intrinsic
622#if defined(__GNUC__) && !defined(__clang__) && \
623 ((__GNUC__ <= 13 && defined(__arm__)) || \
624 (__GNUC__ == 10 && __GNUC_MINOR__ < 3 && defined(__aarch64__)) || \
625 (__GNUC__ <= 9 && defined(__aarch64__)))
626FORCE_INLINE uint8x16x4_t _sse2neon_vld1q_u8_x4(const uint8_t *p)
627{
628 uint8x16x4_t ret;
629 ret.val[0] = vld1q_u8(p + 0);
630 ret.val[1] = vld1q_u8(p + 16);
631 ret.val[2] = vld1q_u8(p + 32);
632 ret.val[3] = vld1q_u8(p + 48);
633 return ret;
634}
635#else
636// Wraps vld1q_u8_x4
637FORCE_INLINE uint8x16x4_t _sse2neon_vld1q_u8_x4(const uint8_t *p)
638{
639 return vld1q_u8_x4(p);
640}
641#endif
642
643#if !defined(__aarch64__) && !defined(_M_ARM64) && !defined(_M_ARM64EC)
644/* emulate vaddv u8 variant */
645FORCE_INLINE uint8_t _sse2neon_vaddv_u8(uint8x8_t v8)
646{
647 const uint64x1_t v1 = vpaddl_u32(vpaddl_u16(vpaddl_u8(v8)));
648 return vget_lane_u8(vreinterpret_u8_u64(v1), 0);
649}
650#else
651// Wraps vaddv_u8
652FORCE_INLINE uint8_t _sse2neon_vaddv_u8(uint8x8_t v8)
653{
654 return vaddv_u8(v8);
655}
656#endif
657
658#if !defined(__aarch64__) && !defined(_M_ARM64) && !defined(_M_ARM64EC)
659/* emulate vaddvq u8 variant */
660FORCE_INLINE uint8_t _sse2neon_vaddvq_u8(uint8x16_t a)
661{
662 uint8x8_t tmp = vpadd_u8(vget_low_u8(a), vget_high_u8(a));
663 uint8_t res = 0;
664 for (int i = 0; i < 8; ++i)
665 res += tmp[i];
666 return res;
667}
668#else
669// Wraps vaddvq_u8
670FORCE_INLINE uint8_t _sse2neon_vaddvq_u8(uint8x16_t a)
671{
672 return vaddvq_u8(a);
673}
674#endif
675
676#if !defined(__aarch64__) && !defined(_M_ARM64) && !defined(_M_ARM64EC)
677/* emulate vaddvq u16 variant */
678FORCE_INLINE uint16_t _sse2neon_vaddvq_u16(uint16x8_t a)
679{
680 uint32x4_t m = vpaddlq_u16(a);
681 uint64x2_t n = vpaddlq_u32(m);
682 uint64x1_t o = vget_low_u64(n) + vget_high_u64(n);
683
684 return vget_lane_u32((uint32x2_t) o, 0);
685}
686#else
687// Wraps vaddvq_u16
688FORCE_INLINE uint16_t _sse2neon_vaddvq_u16(uint16x8_t a)
689{
690 return vaddvq_u16(a);
691}
692#endif
693
694/* Function Naming Conventions
695 * The naming convention of SSE intrinsics is straightforward. A generic SSE
696 * intrinsic function is given as follows:
697 * _mm_<name>_<data_type>
698 *
699 * The parts of this format are given as follows:
700 * 1. <name> describes the operation performed by the intrinsic
701 * 2. <data_type> identifies the data type of the function's primary arguments
702 *
703 * This last part, <data_type>, is a little complicated. It identifies the
704 * content of the input values, and can be set to any of the following values:
705 * + ps - vectors contain floats (ps stands for packed single-precision)
706 * + pd - vectors contain doubles (pd stands for packed double-precision)
707 * + epi8/epi16/epi32/epi64 - vectors contain 8-bit/16-bit/32-bit/64-bit
708 * signed integers
709 * + epu8/epu16/epu32/epu64 - vectors contain 8-bit/16-bit/32-bit/64-bit
710 * unsigned integers
711 * + si128 - unspecified 128-bit vector or 256-bit vector
712 * + m128/m128i/m128d - identifies input vector types when they are different
713 * than the type of the returned vector
714 *
715 * For example, _mm_setzero_ps. The _mm implies that the function returns
716 * a 128-bit vector. The _ps at the end implies that the argument vectors
717 * contain floats.
718 *
719 * A complete example: Byte Shuffle - pshufb (_mm_shuffle_epi8)
720 * // Set packed 16-bit integers. 128 bits, 8 short, per 16 bits
721 * __m128i v_in = _mm_setr_epi16(1, 2, 3, 4, 5, 6, 7, 8);
722 * // Set packed 8-bit integers
723 * // 128 bits, 16 chars, per 8 bits
724 * __m128i v_perm = _mm_setr_epi8(1, 0, 2, 3, 8, 9, 10, 11,
725 * 4, 5, 12, 13, 6, 7, 14, 15);
726 * // Shuffle packed 8-bit integers
727 * __m128i v_out = _mm_shuffle_epi8(v_in, v_perm); // pshufb
728 */
729
730/* Constants for use with _mm_prefetch. */
731#if defined(_M_ARM64EC)
732/* winnt.h already defines these constants as macros, so undefine them first. */
733#undef _MM_HINT_NTA
734#undef _MM_HINT_T0
735#undef _MM_HINT_T1
736#undef _MM_HINT_T2
737#endif
738enum _mm_hint {
739    _MM_HINT_NTA = 0, /* load data to L1 and L2 cache, mark it as NTA */
740 _MM_HINT_T0 = 1, /* load data to L1 and L2 cache */
741 _MM_HINT_T1 = 2, /* load data to L2 cache only */
742 _MM_HINT_T2 = 3, /* load data to L2 cache only, mark it as NTA */
743};
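/* Illustrative usage (with the usual SSE prototype
 * void _mm_prefetch(const char *p, int hint), provided later in this file):
 *
 *     _mm_prefetch((const char *) (ptr + 64), _MM_HINT_T0);
 *     // hints that the cache line at ptr + 64 will be needed soon; on Arm
 *     // this maps to a prefetch instruction, otherwise it is effectively a
 *     // no-op.
 */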
744
745// The bit field mapping of the FPCR (floating-point control register)
746typedef struct {
747 uint16_t res0;
748 uint8_t res1 : 6;
749 uint8_t bit22 : 1;
750 uint8_t bit23 : 1;
751 uint8_t bit24 : 1;
752 uint8_t res2 : 7;
753#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
754 uint32_t res3;
755#endif
756} fpcr_bitfield;
757
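/* Illustrative sketch (AArch64; FPCR.RMode occupies bits 22-23 and FPCR.FZ is
 * bit 24, which is what the bit22/bit23/bit24 fields above correspond to).
 * Reading the register into the bit field could look like:
 *
 *     uint64_t value;
 *     __asm__ __volatile__("mrs %0, FPCR" : "=r"(value));
 *     fpcr_bitfield r;
 *     memcpy(&r, &value, sizeof(r));
 *     // r.bit24 now reflects the flush-to-zero setting
 */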
758// Takes the upper 64 bits of a and places it in the low end of the result
759// Takes the lower 64 bits of b and places it into the high end of the result.
761{
762 float32x2_t a32 = vget_high_f32(vreinterpretq_f32_m128(a));
763 float32x2_t b10 = vget_low_f32(vreinterpretq_f32_m128(b));
764 return vreinterpretq_m128_f32(vcombine_f32(a32, b10));
765}
766
767// Takes the lower two 32-bit values from a, swaps them, and places them in the
768// high end of the result; takes the higher two 32-bit values from b, swaps them,
769// and places them in the low end of the result.
771{
772 float32x2_t a01 = vrev64_f32(vget_low_f32(vreinterpretq_f32_m128(a)));
773 float32x2_t b23 = vrev64_f32(vget_high_f32(vreinterpretq_f32_m128(b)));
774 return vreinterpretq_m128_f32(vcombine_f32(a01, b23));
775}
776
778{
779 float32x2_t a21 = vget_high_f32(
781 float32x2_t b03 = vget_low_f32(
783 return vreinterpretq_m128_f32(vcombine_f32(a21, b03));
784}
785
787{
788 float32x2_t a03 = vget_low_f32(
790 float32x2_t b21 = vget_high_f32(
792 return vreinterpretq_m128_f32(vcombine_f32(a03, b21));
793}
794
796{
797 float32x2_t a10 = vget_low_f32(vreinterpretq_f32_m128(a));
798 float32x2_t b10 = vget_low_f32(vreinterpretq_f32_m128(b));
799 return vreinterpretq_m128_f32(vcombine_f32(a10, b10));
800}
801
803{
804 float32x2_t a01 = vrev64_f32(vget_low_f32(vreinterpretq_f32_m128(a)));
805 float32x2_t b10 = vget_low_f32(vreinterpretq_f32_m128(b));
806 return vreinterpretq_m128_f32(vcombine_f32(a01, b10));
807}
808
810{
811 float32x2_t a01 = vrev64_f32(vget_low_f32(vreinterpretq_f32_m128(a)));
812 float32x2_t b01 = vrev64_f32(vget_low_f32(vreinterpretq_f32_m128(b)));
813 return vreinterpretq_m128_f32(vcombine_f32(a01, b01));
814}
815
816// keeps the low 64 bits of a in the low half and puts the high 64 bits of b in
817// the high half
819{
820 float32x2_t a10 = vget_low_f32(vreinterpretq_f32_m128(a));
821 float32x2_t b32 = vget_high_f32(vreinterpretq_f32_m128(b));
822 return vreinterpretq_m128_f32(vcombine_f32(a10, b32));
823}
824
826{
827 float32x2_t a11 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(a)), 1);
828 float32x2_t b00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 0);
829 return vreinterpretq_m128_f32(vcombine_f32(a11, b00));
830}
831
833{
834 float32x2_t a22 =
835 vdup_lane_f32(vget_high_f32(vreinterpretq_f32_m128(a)), 0);
836 float32x2_t b00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 0);
837 return vreinterpretq_m128_f32(vcombine_f32(a22, b00));
838}
839
841{
842 float32x2_t a00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(a)), 0);
843 float32x2_t b22 =
844 vdup_lane_f32(vget_high_f32(vreinterpretq_f32_m128(b)), 0);
845 return vreinterpretq_m128_f32(vcombine_f32(a00, b22));
846}
847
849{
850 float32_t a0 = vgetq_lane_f32(vreinterpretq_f32_m128(a), 0);
851 float32x2_t a22 =
852 vdup_lane_f32(vget_high_f32(vreinterpretq_f32_m128(a)), 0);
853 float32x2_t a02 = vset_lane_f32(a0, a22, 1); /* TODO: use vzip ?*/
854 float32x2_t b32 = vget_high_f32(vreinterpretq_f32_m128(b));
855 return vreinterpretq_m128_f32(vcombine_f32(a02, b32));
856}
857
859{
860 float32x2_t a33 =
861 vdup_lane_f32(vget_high_f32(vreinterpretq_f32_m128(a)), 1);
862 float32x2_t b11 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 1);
863 return vreinterpretq_m128_f32(vcombine_f32(a33, b11));
864}
865
867{
868 float32x2_t a10 = vget_low_f32(vreinterpretq_f32_m128(a));
869 float32_t b2 = vgetq_lane_f32(vreinterpretq_f32_m128(b), 2);
870 float32x2_t b00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 0);
871 float32x2_t b20 = vset_lane_f32(b2, b00, 1);
872 return vreinterpretq_m128_f32(vcombine_f32(a10, b20));
873}
874
876{
877 float32x2_t a01 = vrev64_f32(vget_low_f32(vreinterpretq_f32_m128(a)));
878 float32_t b2 = vgetq_lane_f32(b, 2);
879 float32x2_t b00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 0);
880 float32x2_t b20 = vset_lane_f32(b2, b00, 1);
881 return vreinterpretq_m128_f32(vcombine_f32(a01, b20));
882}
883
885{
886 float32x2_t a32 = vget_high_f32(vreinterpretq_f32_m128(a));
887 float32_t b2 = vgetq_lane_f32(b, 2);
888 float32x2_t b00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 0);
889 float32x2_t b20 = vset_lane_f32(b2, b00, 1);
890 return vreinterpretq_m128_f32(vcombine_f32(a32, b20));
891}
892
893// For MSVC, we check only if it is ARM64, as every single ARM64 processor
894// supported by WoA has crypto extensions. If this changes in the future,
895// this can be verified via the runtime-only method of:
896// IsProcessorFeaturePresent(PF_ARM_V8_CRYPTO_INSTRUCTIONS_AVAILABLE)
897#if ((defined(_M_ARM64) || defined(_M_ARM64EC)) && !defined(__clang__)) || \
898 (defined(__ARM_FEATURE_CRYPTO) && \
899 (defined(__aarch64__) || __has_builtin(__builtin_arm_crypto_vmullp64)))
900// Wraps vmull_p64
901FORCE_INLINE uint64x2_t _sse2neon_vmull_p64(uint64x1_t _a, uint64x1_t _b)
902{
903 poly64_t a = vget_lane_p64(vreinterpret_p64_u64(_a), 0);
904 poly64_t b = vget_lane_p64(vreinterpret_p64_u64(_b), 0);
905#if defined(_MSC_VER) && !defined(__clang__)
906 __n64 a1 = {a}, b1 = {b};
907 return vreinterpretq_u64_p128(vmull_p64(a1, b1));
908#else
909 return vreinterpretq_u64_p128(vmull_p64(a, b));
910#endif
911}
912#else // ARMv7 polyfill
913// ARMv7/some A64 lacks vmull_p64, but it has vmull_p8.
914//
915// vmull_p8 calculates 8 8-bit->16-bit polynomial multiplies, but we need a
916// 64-bit->128-bit polynomial multiply.
917//
918// It needs some work and is somewhat slow, but it is still faster than all
919// known scalar methods.
920//
921// Algorithm adapted to C from
922// https://www.workofard.com/2017/07/ghash-for-low-end-cores/, which is adapted
923// from "Fast Software Polynomial Multiplication on ARM Processors Using the
924// NEON Engine" by Danilo Camara, Conrado Gouvea, Julio Lopez and Ricardo Dahab
925// (https://hal.inria.fr/hal-01506572)
926static uint64x2_t _sse2neon_vmull_p64(uint64x1_t _a, uint64x1_t _b)
927{
928 poly8x8_t a = vreinterpret_p8_u64(_a);
929 poly8x8_t b = vreinterpret_p8_u64(_b);
930
931 // Masks
932 uint8x16_t k48_32 = vcombine_u8(vcreate_u8(0x0000ffffffffffff),
933 vcreate_u8(0x00000000ffffffff));
934 uint8x16_t k16_00 = vcombine_u8(vcreate_u8(0x000000000000ffff),
935 vcreate_u8(0x0000000000000000));
936
937 // Do the multiplies, rotating with vext to get all combinations
938 uint8x16_t d = vreinterpretq_u8_p16(vmull_p8(a, b)); // D = A0 * B0
939 uint8x16_t e =
940 vreinterpretq_u8_p16(vmull_p8(a, vext_p8(b, b, 1))); // E = A0 * B1
941 uint8x16_t f =
942 vreinterpretq_u8_p16(vmull_p8(vext_p8(a, a, 1), b)); // F = A1 * B0
943 uint8x16_t g =
944 vreinterpretq_u8_p16(vmull_p8(a, vext_p8(b, b, 2))); // G = A0 * B2
945 uint8x16_t h =
946 vreinterpretq_u8_p16(vmull_p8(vext_p8(a, a, 2), b)); // H = A2 * B0
947 uint8x16_t i =
948 vreinterpretq_u8_p16(vmull_p8(a, vext_p8(b, b, 3))); // I = A0 * B3
949 uint8x16_t j =
950 vreinterpretq_u8_p16(vmull_p8(vext_p8(a, a, 3), b)); // J = A3 * B0
951 uint8x16_t k =
952        vreinterpretq_u8_p16(vmull_p8(a, vext_p8(b, b, 4))); // K = A0 * B4
953
954 // Add cross products
955 uint8x16_t l = veorq_u8(e, f); // L = E + F
956 uint8x16_t m = veorq_u8(g, h); // M = G + H
957 uint8x16_t n = veorq_u8(i, j); // N = I + J
958
959 // Interleave. Using vzip1 and vzip2 prevents Clang from emitting TBL
960 // instructions.
961#if defined(__aarch64__)
962 uint8x16_t lm_p0 = vreinterpretq_u8_u64(
963 vzip1q_u64(vreinterpretq_u64_u8(l), vreinterpretq_u64_u8(m)));
964 uint8x16_t lm_p1 = vreinterpretq_u8_u64(
965 vzip2q_u64(vreinterpretq_u64_u8(l), vreinterpretq_u64_u8(m)));
966 uint8x16_t nk_p0 = vreinterpretq_u8_u64(
967 vzip1q_u64(vreinterpretq_u64_u8(n), vreinterpretq_u64_u8(k)));
968 uint8x16_t nk_p1 = vreinterpretq_u8_u64(
969 vzip2q_u64(vreinterpretq_u64_u8(n), vreinterpretq_u64_u8(k)));
970#else
971 uint8x16_t lm_p0 = vcombine_u8(vget_low_u8(l), vget_low_u8(m));
972 uint8x16_t lm_p1 = vcombine_u8(vget_high_u8(l), vget_high_u8(m));
973 uint8x16_t nk_p0 = vcombine_u8(vget_low_u8(n), vget_low_u8(k));
974 uint8x16_t nk_p1 = vcombine_u8(vget_high_u8(n), vget_high_u8(k));
975#endif
976 // t0 = (L) (P0 + P1) << 8
977 // t1 = (M) (P2 + P3) << 16
978 uint8x16_t t0t1_tmp = veorq_u8(lm_p0, lm_p1);
979 uint8x16_t t0t1_h = vandq_u8(lm_p1, k48_32);
980 uint8x16_t t0t1_l = veorq_u8(t0t1_tmp, t0t1_h);
981
982 // t2 = (N) (P4 + P5) << 24
983 // t3 = (K) (P6 + P7) << 32
984 uint8x16_t t2t3_tmp = veorq_u8(nk_p0, nk_p1);
985 uint8x16_t t2t3_h = vandq_u8(nk_p1, k16_00);
986 uint8x16_t t2t3_l = veorq_u8(t2t3_tmp, t2t3_h);
987
988 // De-interleave
989#if defined(__aarch64__)
990 uint8x16_t t0 = vreinterpretq_u8_u64(
991 vuzp1q_u64(vreinterpretq_u64_u8(t0t1_l), vreinterpretq_u64_u8(t0t1_h)));
992 uint8x16_t t1 = vreinterpretq_u8_u64(
993 vuzp2q_u64(vreinterpretq_u64_u8(t0t1_l), vreinterpretq_u64_u8(t0t1_h)));
994 uint8x16_t t2 = vreinterpretq_u8_u64(
995 vuzp1q_u64(vreinterpretq_u64_u8(t2t3_l), vreinterpretq_u64_u8(t2t3_h)));
996 uint8x16_t t3 = vreinterpretq_u8_u64(
997 vuzp2q_u64(vreinterpretq_u64_u8(t2t3_l), vreinterpretq_u64_u8(t2t3_h)));
998#else
999 uint8x16_t t1 = vcombine_u8(vget_high_u8(t0t1_l), vget_high_u8(t0t1_h));
1000 uint8x16_t t0 = vcombine_u8(vget_low_u8(t0t1_l), vget_low_u8(t0t1_h));
1001 uint8x16_t t3 = vcombine_u8(vget_high_u8(t2t3_l), vget_high_u8(t2t3_h));
1002 uint8x16_t t2 = vcombine_u8(vget_low_u8(t2t3_l), vget_low_u8(t2t3_h));
1003#endif
1004 // Shift the cross products
1005 uint8x16_t t0_shift = vextq_u8(t0, t0, 15); // t0 << 8
1006 uint8x16_t t1_shift = vextq_u8(t1, t1, 14); // t1 << 16
1007 uint8x16_t t2_shift = vextq_u8(t2, t2, 13); // t2 << 24
1008 uint8x16_t t3_shift = vextq_u8(t3, t3, 12); // t3 << 32
1009
1010 // Accumulate the products
1011 uint8x16_t cross1 = veorq_u8(t0_shift, t1_shift);
1012 uint8x16_t cross2 = veorq_u8(t2_shift, t3_shift);
1013 uint8x16_t mix = veorq_u8(d, cross1);
1014 uint8x16_t r = veorq_u8(mix, cross2);
1015 return vreinterpretq_u64_u8(r);
1016}
1017#endif // ARMv7 polyfill
1018
1019// C equivalent:
1020// __m128i _mm_shuffle_epi32_default(__m128i a,
1021// __constrange(0, 255) int imm) {
1022// __m128i ret;
1023// ret[0] = a[(imm) & 0x3]; ret[1] = a[((imm) >> 2) & 0x3];
1024// ret[2] = a[((imm) >> 4) & 0x03]; ret[3] = a[((imm) >> 6) & 0x03];
1025// return ret;
1026// }
1027#define _mm_shuffle_epi32_default(a, imm) \
1028 vreinterpretq_m128i_s32(vsetq_lane_s32( \
1029 vgetq_lane_s32(vreinterpretq_s32_m128i(a), ((imm) >> 6) & 0x3), \
1030 vsetq_lane_s32( \
1031 vgetq_lane_s32(vreinterpretq_s32_m128i(a), ((imm) >> 4) & 0x3), \
1032 vsetq_lane_s32(vgetq_lane_s32(vreinterpretq_s32_m128i(a), \
1033 ((imm) >> 2) & 0x3), \
1034 vmovq_n_s32(vgetq_lane_s32( \
1035 vreinterpretq_s32_m128i(a), (imm) & (0x3))), \
1036 1), \
1037 2), \
1038 3))
1039
1040// Takes the upper 64 bits of a and places it in the low end of the result
1041// Takes the lower 64 bits of a and places it into the high end of the result.
1043{
1044 int32x2_t a32 = vget_high_s32(vreinterpretq_s32_m128i(a));
1045 int32x2_t a10 = vget_low_s32(vreinterpretq_s32_m128i(a));
1046 return vreinterpretq_m128i_s32(vcombine_s32(a32, a10));
1047}
1048
1049// Takes the lower two 32-bit values from a, swaps them, and places them in the
1050// low end of the result; takes the higher two 32-bit values from a, swaps them,
1051// and places them in the high end of the result.
1053{
1054 int32x2_t a01 = vrev64_s32(vget_low_s32(vreinterpretq_s32_m128i(a)));
1055 int32x2_t a23 = vrev64_s32(vget_high_s32(vreinterpretq_s32_m128i(a)));
1056 return vreinterpretq_m128i_s32(vcombine_s32(a01, a23));
1057}
1058
1059// rotates the least significant 32 bits into the most significant 32 bits, and
1060// shifts the rest down
1062{
1065}
1066
1067// rotates the most significant 32 bits into the least significant 32 bits, and
1068// shifts the rest up
1070{
1073}
1074
1075// gets the lower 64 bits of a, and places it in the upper 64 bits
1076// gets the lower 64 bits of a and places it in the lower 64 bits
1078{
1079 int32x2_t a10 = vget_low_s32(vreinterpretq_s32_m128i(a));
1080 return vreinterpretq_m128i_s32(vcombine_s32(a10, a10));
1081}
1082
1083// Gets the lower 64 bits of a, swaps the 0 and 1 elements, and places them in
1084// the lower 64 bits; places the unswapped lower 64 bits of a in the upper 64 bits.
1086{
1087 int32x2_t a01 = vrev64_s32(vget_low_s32(vreinterpretq_s32_m128i(a)));
1088 int32x2_t a10 = vget_low_s32(vreinterpretq_s32_m128i(a));
1089 return vreinterpretq_m128i_s32(vcombine_s32(a01, a10));
1090}
1091
1092// Gets the lower 64 bits of a, swaps the 0 and 1 elements, and places them in
1093// the upper 64 bits; gets the lower 64 bits of a, swaps the 0 and 1 elements,
1094// and places them in the lower 64 bits.
1096{
1097 int32x2_t a01 = vrev64_s32(vget_low_s32(vreinterpretq_s32_m128i(a)));
1098 return vreinterpretq_m128i_s32(vcombine_s32(a01, a01));
1099}
1100
1102{
1103 int32x2_t a11 = vdup_lane_s32(vget_low_s32(vreinterpretq_s32_m128i(a)), 1);
1104 int32x2_t a22 = vdup_lane_s32(vget_high_s32(vreinterpretq_s32_m128i(a)), 0);
1105 return vreinterpretq_m128i_s32(vcombine_s32(a11, a22));
1106}
1107
1109{
1110 int32x2_t a22 = vdup_lane_s32(vget_high_s32(vreinterpretq_s32_m128i(a)), 0);
1111 int32x2_t a01 = vrev64_s32(vget_low_s32(vreinterpretq_s32_m128i(a)));
1112 return vreinterpretq_m128i_s32(vcombine_s32(a22, a01));
1113}
1114
1116{
1117 int32x2_t a32 = vget_high_s32(vreinterpretq_s32_m128i(a));
1118 int32x2_t a33 = vdup_lane_s32(vget_high_s32(vreinterpretq_s32_m128i(a)), 1);
1119 return vreinterpretq_m128i_s32(vcombine_s32(a32, a33));
1120}
1121
1122#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
1123#define _mm_shuffle_epi32_splat(a, imm) \
1124 vreinterpretq_m128i_s32(vdupq_laneq_s32(vreinterpretq_s32_m128i(a), (imm)))
1125#else
1126#define _mm_shuffle_epi32_splat(a, imm) \
1127 vreinterpretq_m128i_s32( \
1128 vdupq_n_s32(vgetq_lane_s32(vreinterpretq_s32_m128i(a), (imm))))
1129#endif
1130
1131// NEON does not support a general purpose permute intrinsic.
1132// Shuffle single-precision (32-bit) floating-point elements in a using the
1133// control in imm8, and store the results in dst.
1134//
1135// C equivalent:
1136// __m128 _mm_shuffle_ps_default(__m128 a, __m128 b,
1137// __constrange(0, 255) int imm) {
1138// __m128 ret;
1139// ret[0] = a[(imm) & 0x3]; ret[1] = a[((imm) >> 2) & 0x3];
1140// ret[2] = b[((imm) >> 4) & 0x03]; ret[3] = b[((imm) >> 6) & 0x03];
1141// return ret;
1142// }
1143//
1144// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shuffle_ps
1145#define _mm_shuffle_ps_default(a, b, imm) \
1146 vreinterpretq_m128_f32(vsetq_lane_f32( \
1147 vgetq_lane_f32(vreinterpretq_f32_m128(b), ((imm) >> 6) & 0x3), \
1148 vsetq_lane_f32( \
1149 vgetq_lane_f32(vreinterpretq_f32_m128(b), ((imm) >> 4) & 0x3), \
1150 vsetq_lane_f32( \
1151 vgetq_lane_f32(vreinterpretq_f32_m128(a), ((imm) >> 2) & 0x3), \
1152 vmovq_n_f32( \
1153 vgetq_lane_f32(vreinterpretq_f32_m128(a), (imm) & (0x3))), \
1154 1), \
1155 2), \
1156 3))
1157
1158// Shuffle 16-bit integers in the low 64 bits of a using the control in imm8.
1159// Store the results in the low 64 bits of dst, with the high 64 bits being
1160// copied from a to dst.
1161// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shufflelo_epi16
1162#define _mm_shufflelo_epi16_function(a, imm) \
1163 _sse2neon_define1( \
1164 __m128i, a, int16x8_t ret = vreinterpretq_s16_m128i(_a); \
1165 int16x4_t lowBits = vget_low_s16(ret); \
1166 ret = vsetq_lane_s16(vget_lane_s16(lowBits, (imm) & (0x3)), ret, 0); \
1167 ret = vsetq_lane_s16(vget_lane_s16(lowBits, ((imm) >> 2) & 0x3), ret, \
1168 1); \
1169 ret = vsetq_lane_s16(vget_lane_s16(lowBits, ((imm) >> 4) & 0x3), ret, \
1170 2); \
1171 ret = vsetq_lane_s16(vget_lane_s16(lowBits, ((imm) >> 6) & 0x3), ret, \
1172 3); \
1173 _sse2neon_return(vreinterpretq_m128i_s16(ret));)
1174
1175// Shuffle 16-bit integers in the high 64 bits of a using the control in imm8.
1176// Store the results in the high 64 bits of dst, with the low 64 bits being
1177// copied from a to dst.
1178// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shufflehi_epi16
1179#define _mm_shufflehi_epi16_function(a, imm) \
1180 _sse2neon_define1( \
1181 __m128i, a, int16x8_t ret = vreinterpretq_s16_m128i(_a); \
1182 int16x4_t highBits = vget_high_s16(ret); \
1183 ret = vsetq_lane_s16(vget_lane_s16(highBits, (imm) & (0x3)), ret, 4); \
1184 ret = vsetq_lane_s16(vget_lane_s16(highBits, ((imm) >> 2) & 0x3), ret, \
1185 5); \
1186 ret = vsetq_lane_s16(vget_lane_s16(highBits, ((imm) >> 4) & 0x3), ret, \
1187 6); \
1188 ret = vsetq_lane_s16(vget_lane_s16(highBits, ((imm) >> 6) & 0x3), ret, \
1189 7); \
1190 _sse2neon_return(vreinterpretq_m128i_s16(ret));)
1191
1192/* MMX */
1193
1194// _mm_empty is a no-op on Arm.
1195FORCE_INLINE void _mm_empty(void) {}
1196
1197/* SSE */
1198
1199// Add packed single-precision (32-bit) floating-point elements in a and b, and
1200// store the results in dst.
1201// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_ps
1202FORCE_INLINE __m128 _mm_add_ps(__m128 a, __m128 b)
1203{
1206}
1207
1208// Add the lower single-precision (32-bit) floating-point element in a and b,
1209// store the result in the lower element of dst, and copy the upper 3 packed
1210// elements from a to the upper elements of dst.
1211// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_ss
1212FORCE_INLINE __m128 _mm_add_ss(__m128 a, __m128 b)
1213{
1214 float32_t b0 = vgetq_lane_f32(vreinterpretq_f32_m128(b), 0);
1215 float32x4_t value = vsetq_lane_f32(b0, vdupq_n_f32(0), 0);
1216 // the upper values in the result must be the remnants of <a>.
1217 return vreinterpretq_m128_f32(vaddq_f32(a, value));
1218}
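// Worked example (illustrative):
//     __m128 a = _mm_set_ps(4.0f, 3.0f, 2.0f, 1.0f); // a = {1, 2, 3, 4}
//     __m128 b = _mm_set_ps(40.f, 30.f, 20.f, 10.f); // b = {10, 20, 30, 40}
//     __m128 r = _mm_add_ss(a, b);                   // r = {11, 2, 3, 4}
// Only lane 0 of b survives in `value`, so lanes 1-3 of the result come from a,
// matching the x86 ADDSS behavior.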
1219
1220// Compute the bitwise AND of packed single-precision (32-bit) floating-point
1221// elements in a and b, and store the results in dst.
1222// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_and_ps
1223FORCE_INLINE __m128 _mm_and_ps(__m128 a, __m128 b)
1224{
1227}
1228
1229// Compute the bitwise NOT of packed single-precision (32-bit) floating-point
1230// elements in a and then AND with b, and store the results in dst.
1231// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_andnot_ps
1233{
1235 vbicq_s32(vreinterpretq_s32_m128(b),
1236 vreinterpretq_s32_m128(a))); // *NOTE* argument swap
1237}
1238
1239// Average packed unsigned 16-bit integers in a and b, and store the results in
1240// dst.
1241// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_avg_pu16
1242FORCE_INLINE __m64 _mm_avg_pu16(__m64 a, __m64 b)
1243{
1244 return vreinterpret_m64_u16(
1245 vrhadd_u16(vreinterpret_u16_m64(a), vreinterpret_u16_m64(b)));
1246}
1247
1248// Average packed unsigned 8-bit integers in a and b, and store the results in
1249// dst.
1250// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_avg_pu8
1251FORCE_INLINE __m64 _mm_avg_pu8(__m64 a, __m64 b)
1252{
1253 return vreinterpret_m64_u8(
1254 vrhadd_u8(vreinterpret_u8_m64(a), vreinterpret_u8_m64(b)));
1255}
1256
1257// Compare packed single-precision (32-bit) floating-point elements in a and b
1258// for equality, and store the results in dst.
1259// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpeq_ps
1261{
1264}
1265
1266// Compare the lower single-precision (32-bit) floating-point elements in a and
1267// b for equality, store the result in the lower element of dst, and copy the
1268// upper 3 packed elements from a to the upper elements of dst.
1269// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpeq_ss
1271{
1272 return _mm_move_ss(a, _mm_cmpeq_ps(a, b));
1273}
1274
1275// Compare packed single-precision (32-bit) floating-point elements in a and b
1276// for greater-than-or-equal, and store the results in dst.
1277// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpge_ps
1279{
1282}
1283
1284// Compare the lower single-precision (32-bit) floating-point elements in a and
1285// b for greater-than-or-equal, store the result in the lower element of dst,
1286// and copy the upper 3 packed elements from a to the upper elements of dst.
1287// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpge_ss
1289{
1290 return _mm_move_ss(a, _mm_cmpge_ps(a, b));
1291}
1292
1293// Compare packed single-precision (32-bit) floating-point elements in a and b
1294// for greater-than, and store the results in dst.
1295// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpgt_ps
1297{
1300}
1301
1302// Compare the lower single-precision (32-bit) floating-point elements in a and
1303// b for greater-than, store the result in the lower element of dst, and copy
1304// the upper 3 packed elements from a to the upper elements of dst.
1305// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpgt_ss
1307{
1308 return _mm_move_ss(a, _mm_cmpgt_ps(a, b));
1309}
1310
1311// Compare packed single-precision (32-bit) floating-point elements in a and b
1312// for less-than-or-equal, and store the results in dst.
1313// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmple_ps
1315{
1318}
1319
1320// Compare the lower single-precision (32-bit) floating-point elements in a and
1321// b for less-than-or-equal, store the result in the lower element of dst, and
1322// copy the upper 3 packed elements from a to the upper elements of dst.
1323// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmple_ss
1325{
1326 return _mm_move_ss(a, _mm_cmple_ps(a, b));
1327}
1328
1329// Compare packed single-precision (32-bit) floating-point elements in a and b
1330// for less-than, and store the results in dst.
1331// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmplt_ps
1333{
1336}
1337
1338// Compare the lower single-precision (32-bit) floating-point elements in a and
1339// b for less-than, store the result in the lower element of dst, and copy the
1340// upper 3 packed elements from a to the upper elements of dst.
1341// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmplt_ss
1343{
1344 return _mm_move_ss(a, _mm_cmplt_ps(a, b));
1345}
1346
1347// Compare packed single-precision (32-bit) floating-point elements in a and b
1348// for not-equal, and store the results in dst.
1349// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpneq_ps
1351{
1352 return vreinterpretq_m128_u32(vmvnq_u32(
1354}
1355
1356// Compare the lower single-precision (32-bit) floating-point elements in a and
1357// b for not-equal, store the result in the lower element of dst, and copy the
1358// upper 3 packed elements from a to the upper elements of dst.
1359// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpneq_ss
1361{
1362 return _mm_move_ss(a, _mm_cmpneq_ps(a, b));
1363}
1364
1365// Compare packed single-precision (32-bit) floating-point elements in a and b
1366// for not-greater-than-or-equal, and store the results in dst.
1367// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnge_ps
1369{
1370 return vreinterpretq_m128_u32(vmvnq_u32(
1372}
1373
1374// Compare the lower single-precision (32-bit) floating-point elements in a and
1375// b for not-greater-than-or-equal, store the result in the lower element of
1376// dst, and copy the upper 3 packed elements from a to the upper elements of
1377// dst.
1378// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnge_ss
1380{
1381 return _mm_move_ss(a, _mm_cmpnge_ps(a, b));
1382}
1383
1384// Compare packed single-precision (32-bit) floating-point elements in a and b
1385// for not-greater-than, and store the results in dst.
1386// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpngt_ps
1388{
1389 return vreinterpretq_m128_u32(vmvnq_u32(
1391}
1392
1393// Compare the lower single-precision (32-bit) floating-point elements in a and
1394// b for not-greater-than, store the result in the lower element of dst, and
1395// copy the upper 3 packed elements from a to the upper elements of dst.
1396// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpngt_ss
1398{
1399 return _mm_move_ss(a, _mm_cmpngt_ps(a, b));
1400}
1401
1402// Compare packed single-precision (32-bit) floating-point elements in a and b
1403// for not-less-than-or-equal, and store the results in dst.
1404// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnle_ps
1406{
1407 return vreinterpretq_m128_u32(vmvnq_u32(
1409}
1410
1411// Compare the lower single-precision (32-bit) floating-point elements in a and
1412// b for not-less-than-or-equal, store the result in the lower element of dst,
1413// and copy the upper 3 packed elements from a to the upper elements of dst.
1414// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnle_ss
1416{
1417 return _mm_move_ss(a, _mm_cmpnle_ps(a, b));
1418}
1419
1420// Compare packed single-precision (32-bit) floating-point elements in a and b
1421// for not-less-than, and store the results in dst.
1422// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnlt_ps
1424{
1425 return vreinterpretq_m128_u32(vmvnq_u32(
1427}
1428
1429// Compare the lower single-precision (32-bit) floating-point elements in a and
1430// b for not-less-than, store the result in the lower element of dst, and copy
1431// the upper 3 packed elements from a to the upper elements of dst.
1432// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnlt_ss
1434{
1435 return _mm_move_ss(a, _mm_cmpnlt_ps(a, b));
1436}
1437
1438// Compare packed single-precision (32-bit) floating-point elements in a and b
1439// to see if neither is NaN, and store the results in dst.
1440// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpord_ps
1441//
1442// See also:
1443// http://stackoverflow.com/questions/8627331/what-does-ordered-unordered-comparison-mean
1444// http://stackoverflow.com/questions/29349621/neon-isnanval-intrinsics
1446{
1447    // Note: NEON does not have an ordered-compare builtin.
1448    // We need to compare a == a and b == b to check for NaN,
1449    // then AND the results to get the final mask.
1450 uint32x4_t ceqaa =
1452 uint32x4_t ceqbb =
1454 return vreinterpretq_m128_u32(vandq_u32(ceqaa, ceqbb));
1455}
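// Worked example (illustrative): with a = {1.0, NAN, 3.0, NAN} and
// b = {1.0, 2.0, NAN, NAN}, (a == a) yields {~0, 0, ~0, 0} and (b == b) yields
// {~0, ~0, 0, 0}; ANDing them gives {~0, 0, 0, 0}, i.e. all-ones only in the
// lane where neither input is NaN (the "ordered" lanes).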
1456
1457// Compare the lower single-precision (32-bit) floating-point elements in a and
1458// b to see if neither is NaN, store the result in the lower element of dst, and
1459// copy the upper 3 packed elements from a to the upper elements of dst.
1460// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpord_ss
1462{
1463 return _mm_move_ss(a, _mm_cmpord_ps(a, b));
1464}
1465
1466// Compare packed single-precision (32-bit) floating-point elements in a and b
1467// to see if either is NaN, and store the results in dst.
1468// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpunord_ps
1470{
1471 uint32x4_t f32a =
1473 uint32x4_t f32b =
1475 return vreinterpretq_m128_u32(vmvnq_u32(vandq_u32(f32a, f32b)));
1476}
1477
1478// Compare the lower single-precision (32-bit) floating-point elements in a and
1479// b to see if either is NaN, store the result in the lower element of dst, and
1480// copy the upper 3 packed elements from a to the upper elements of dst.
1481// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpunord_ss
1483{
1484 return _mm_move_ss(a, _mm_cmpunord_ps(a, b));
1485}
1486
1487// Compare the lower single-precision (32-bit) floating-point element in a and b
1488// for equality, and return the boolean result (0 or 1).
1489// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comieq_ss
1490FORCE_INLINE int _mm_comieq_ss(__m128 a, __m128 b)
1491{
1492 uint32x4_t a_eq_b =
1494 return vgetq_lane_u32(a_eq_b, 0) & 0x1;
1495}
1496
1497// Compare the lower single-precision (32-bit) floating-point element in a and b
1498// for greater-than-or-equal, and return the boolean result (0 or 1).
1499// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comige_ss
1500FORCE_INLINE int _mm_comige_ss(__m128 a, __m128 b)
1501{
1502 uint32x4_t a_ge_b =
1503     vcgeq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b));
1504 return vgetq_lane_u32(a_ge_b, 0) & 0x1;
1505}
1506
1507// Compare the lower single-precision (32-bit) floating-point element in a and b
1508// for greater-than, and return the boolean result (0 or 1).
1509// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comigt_ss
1510FORCE_INLINE int _mm_comigt_ss(__m128 a, __m128 b)
1511{
1512 uint32x4_t a_gt_b =
1513     vcgtq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b));
1514 return vgetq_lane_u32(a_gt_b, 0) & 0x1;
1515}
1516
1517// Compare the lower single-precision (32-bit) floating-point element in a and b
1518// for less-than-or-equal, and return the boolean result (0 or 1).
1519// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comile_ss
1520FORCE_INLINE int _mm_comile_ss(__m128 a, __m128 b)
1521{
1522 uint32x4_t a_le_b =
1523     vcleq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b));
1524 return vgetq_lane_u32(a_le_b, 0) & 0x1;
1525}
1526
1527// Compare the lower single-precision (32-bit) floating-point element in a and b
1528// for less-than, and return the boolean result (0 or 1).
1529// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comilt_ss
1530FORCE_INLINE int _mm_comilt_ss(__m128 a, __m128 b)
1531{
1532 uint32x4_t a_lt_b =
1533     vcltq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b));
1534 return vgetq_lane_u32(a_lt_b, 0) & 0x1;
1535}
1536
1537// Compare the lower single-precision (32-bit) floating-point element in a and b
1538// for not-equal, and return the boolean result (0 or 1).
1539// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comineq_ss
1540FORCE_INLINE int _mm_comineq_ss(__m128 a, __m128 b)
1541{
1542 return !_mm_comieq_ss(a, b);
1543}
1544
1545// Convert packed signed 32-bit integers in b to packed single-precision
1546// (32-bit) floating-point elements, store the results in the lower 2 elements
1547// of dst, and copy the upper 2 packed elements from a to the upper elements of
1548// dst.
1549// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvt_pi2ps
1550FORCE_INLINE __m128 _mm_cvt_pi2ps(__m128 a, __m64 b)
1551{
1552 return vreinterpretq_m128_f32(
1553     vcombine_f32(vcvt_f32_s32(vreinterpret_s32_m64(b)),
1554 vget_high_f32(vreinterpretq_f32_m128(a))));
1555}
1556
1557// Convert packed single-precision (32-bit) floating-point elements in a to
1558// packed 32-bit integers, and store the results in dst.
1559// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvt_ps2pi
1560FORCE_INLINE __m64 _mm_cvt_ps2pi(__m128 a)
1561{
1562#if (defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)) || \
1563 defined(__ARM_FEATURE_DIRECTED_ROUNDING)
1564 return vreinterpret_m64_s32(
1565 vget_low_s32(vcvtnq_s32_f32(vrndiq_f32(vreinterpretq_f32_m128(a)))));
1566#else
1567 return vreinterpret_m64_s32(vcvt_s32_f32(vget_low_f32(
1568     vreinterpretq_f32_m128(_mm_round_ps(a, _MM_FROUND_CUR_DIRECTION)))));
1569#endif
1570}
1571
1572// Convert the signed 32-bit integer b to a single-precision (32-bit)
1573// floating-point element, store the result in the lower element of dst, and
1574// copy the upper 3 packed elements from a to the upper elements of dst.
1575// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvt_si2ss
1576FORCE_INLINE __m128 _mm_cvt_si2ss(__m128 a, int b)
1577{
1578 return vreinterpretq_m128_f32(
1579     vsetq_lane_f32((float) b, vreinterpretq_f32_m128(a), 0));
1580}
1581
1582// Convert the lower single-precision (32-bit) floating-point element in a to a
1583// 32-bit integer, and store the result in dst.
1584// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvt_ss2si
1585FORCE_INLINE int _mm_cvt_ss2si(__m128 a)
1586{
1587#if (defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)) || \
1588 defined(__ARM_FEATURE_DIRECTED_ROUNDING)
1589 return vgetq_lane_s32(vcvtnq_s32_f32(vrndiq_f32(vreinterpretq_f32_m128(a))),
1590 0);
1591#else
1592 float32_t data = vgetq_lane_f32(
1593     vreinterpretq_f32_m128(_mm_round_ps(a, _MM_FROUND_CUR_DIRECTION)), 0);
1594 return (int32_t) data;
1595#endif
1596}
1597
1598// Convert packed 16-bit integers in a to packed single-precision (32-bit)
1599// floating-point elements, and store the results in dst.
1600// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtpi16_ps
1601FORCE_INLINE __m128 _mm_cvtpi16_ps(__m64 a)
1602{
1603 return vreinterpretq_m128_f32(
1604     vcvtq_f32_s32(vmovl_s16(vreinterpret_s16_m64(a))));
1605}
1606
1607// Convert packed 32-bit integers in b to packed single-precision (32-bit)
1608// floating-point elements, store the results in the lower 2 elements of dst,
1609// and copy the upper 2 packed elements from a to the upper elements of dst.
1610// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtpi32_ps
1611FORCE_INLINE __m128 _mm_cvtpi32_ps(__m128 a, __m64 b)
1612{
1613 return vreinterpretq_m128_f32(
1614     vcombine_f32(vcvt_f32_s32(vreinterpret_s32_m64(b)),
1615 vget_high_f32(vreinterpretq_f32_m128(a))));
1616}
1617
1618// Convert packed signed 32-bit integers in a to packed single-precision
1619// (32-bit) floating-point elements, store the results in the lower 2 elements
1620// of dst, then convert the packed signed 32-bit integers in b to
1621// single-precision (32-bit) floating-point element, and store the results in
1622// the upper 2 elements of dst.
1623// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtpi32x2_ps
1624FORCE_INLINE __m128 _mm_cvtpi32x2_ps(__m64 a, __m64 b)
1625{
1626 return vreinterpretq_m128_f32(vcvtq_f32_s32(
1627 vcombine_s32(vreinterpret_s32_m64(a), vreinterpret_s32_m64(b))));
1628}
1629
1630// Convert the lower packed 8-bit integers in a to packed single-precision
1631// (32-bit) floating-point elements, and store the results in dst.
1632// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtpi8_ps
1633FORCE_INLINE __m128 _mm_cvtpi8_ps(__m64 a)
1634{
1635 return vreinterpretq_m128_f32(vcvtq_f32_s32(
1636 vmovl_s16(vget_low_s16(vmovl_s8(vreinterpret_s8_m64(a))))));
1637}
1638
1639// Convert packed single-precision (32-bit) floating-point elements in a to
1640// packed 16-bit integers, and store the results in dst. Note: this intrinsic
1641// will generate 0x7FFF, rather than 0x8000, for input values between 0x7FFF and
1642// 0x7FFFFFFF.
1643// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtps_pi16
1644FORCE_INLINE __m64 _mm_cvtps_pi16(__m128 a)
1645{
1646 return vreinterpret_m64_s16(
1647     vqmovn_s32(vreinterpretq_s32_m128i(_mm_cvtps_epi32(a))));
1648}
1649
1650// Convert packed single-precision (32-bit) floating-point elements in a to
1651// packed 32-bit integers, and store the results in dst.
1652// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtps_pi32
1653#define _mm_cvtps_pi32(a) _mm_cvt_ps2pi(a)
1654
1655// Convert packed single-precision (32-bit) floating-point elements in a to
1656// packed 8-bit integers, and store the results in the lower 4 elements of dst.
1657// Note: this intrinsic will generate 0x7F, rather than 0x80, for input values
1658// between 0x7F and 0x7FFFFFFF.
1659// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtps_pi8
1660FORCE_INLINE __m64 _mm_cvtps_pi8(__m128 a)
1661{
1662 return vreinterpret_m64_s8(vqmovn_s16(
1663 vcombine_s16(vreinterpret_s16_m64(_mm_cvtps_pi16(a)), vdup_n_s16(0))));
1664}
1665
1666// Convert packed unsigned 16-bit integers in a to packed single-precision
1667// (32-bit) floating-point elements, and store the results in dst.
1668// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtpu16_ps
1669FORCE_INLINE __m128 _mm_cvtpu16_ps(__m64 a)
1670{
1671 return vreinterpretq_m128_f32(
1672     vcvtq_f32_u32(vmovl_u16(vreinterpret_u16_m64(a))));
1673}
1674
1675// Convert the lower packed unsigned 8-bit integers in a to packed
1676// single-precision (32-bit) floating-point elements, and store the results in
1677// dst.
1678// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtpu8_ps
1679FORCE_INLINE __m128 _mm_cvtpu8_ps(__m64 a)
1680{
1681 return vreinterpretq_m128_f32(vcvtq_f32_u32(
1682 vmovl_u16(vget_low_u16(vmovl_u8(vreinterpret_u8_m64(a))))));
1683}
1684
1685// Convert the signed 32-bit integer b to a single-precision (32-bit)
1686// floating-point element, store the result in the lower element of dst, and
1687// copy the upper 3 packed elements from a to the upper elements of dst.
1688// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi32_ss
1689#define _mm_cvtsi32_ss(a, b) _mm_cvt_si2ss(a, b)
1690
1691// Convert the signed 64-bit integer b to a single-precision (32-bit)
1692// floating-point element, store the result in the lower element of dst, and
1693// copy the upper 3 packed elements from a to the upper elements of dst.
1694// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi64_ss
1695FORCE_INLINE __m128 _mm_cvtsi64_ss(__m128 a, int64_t b)
1696{
1697 return vreinterpretq_m128_f32(
1698     vsetq_lane_f32((float) b, vreinterpretq_f32_m128(a), 0));
1699}
1700
1701// Copy the lower single-precision (32-bit) floating-point element of a to dst.
1702// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtss_f32
1703FORCE_INLINE float _mm_cvtss_f32(__m128 a)
1704{
1705 return vgetq_lane_f32(vreinterpretq_f32_m128(a), 0);
1706}
1707
1708// Convert the lower single-precision (32-bit) floating-point element in a to a
1709// 32-bit integer, and store the result in dst.
1710// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtss_si32
1711#define _mm_cvtss_si32(a) _mm_cvt_ss2si(a)
1712
1713// Convert the lower single-precision (32-bit) floating-point element in a to a
1714// 64-bit integer, and store the result in dst.
1715// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtss_si64
1716FORCE_INLINE int64_t _mm_cvtss_si64(__m128 a)
1717{
1718#if (defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)) || \
1719 defined(__ARM_FEATURE_DIRECTED_ROUNDING)
1720 return (int64_t) vgetq_lane_f32(vrndiq_f32(vreinterpretq_f32_m128(a)), 0);
1721#else
1722 float32_t data = vgetq_lane_f32(
1723     vreinterpretq_f32_m128(_mm_round_ps(a, _MM_FROUND_CUR_DIRECTION)), 0);
1724 return (int64_t) data;
1725#endif
1726}
1727
1728// Convert packed single-precision (32-bit) floating-point elements in a to
1729// packed 32-bit integers with truncation, and store the results in dst.
1730// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtt_ps2pi
1731FORCE_INLINE __m64 _mm_cvtt_ps2pi(__m128 a)
1732{
1733 return vreinterpret_m64_s32(
1734 vget_low_s32(vcvtq_s32_f32(vreinterpretq_f32_m128(a))));
1735}
1736
1737// Convert the lower single-precision (32-bit) floating-point element in a to a
1738// 32-bit integer with truncation, and store the result in dst.
1739// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtt_ss2si
1740FORCE_INLINE int _mm_cvtt_ss2si(__m128 a)
1741{
1742 return vgetq_lane_s32(vcvtq_s32_f32(vreinterpretq_f32_m128(a)), 0);
1743}
1744
1745// Convert packed single-precision (32-bit) floating-point elements in a to
1746// packed 32-bit integers with truncation, and store the results in dst.
1747// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttps_pi32
1748#define _mm_cvttps_pi32(a) _mm_cvtt_ps2pi(a)
1749
1750// Convert the lower single-precision (32-bit) floating-point element in a to a
1751// 32-bit integer with truncation, and store the result in dst.
1752// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttss_si32
1753#define _mm_cvttss_si32(a) _mm_cvtt_ss2si(a)
1754
1755// Convert the lower single-precision (32-bit) floating-point element in a to a
1756// 64-bit integer with truncation, and store the result in dst.
1757// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttss_si64
1758FORCE_INLINE int64_t _mm_cvttss_si64(__m128 a)
1759{
1760 return (int64_t) vgetq_lane_f32(vreinterpretq_f32_m128(a), 0);
1761}
1762
1763// Divide packed single-precision (32-bit) floating-point elements in a by
1764// packed elements in b, and store the results in dst.
1765// Due to ARMv7-A NEON's lack of a precise division intrinsic, division is
1766// implemented by multiplying a by an estimate of the reciprocal of b that is
1767// refined with the Newton-Raphson method.
1768// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_div_ps
1769FORCE_INLINE __m128 _mm_div_ps(__m128 a, __m128 b)
1770{
1771#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
1772 return vreinterpretq_m128_f32(
1773     vdivq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
1774#else
1775 float32x4_t recip = vrecpeq_f32(vreinterpretq_f32_m128(b));
1776 recip = vmulq_f32(recip, vrecpsq_f32(recip, vreinterpretq_f32_m128(b)));
1777 // Additional Newton-Raphson iteration for accuracy
1778 recip = vmulq_f32(recip, vrecpsq_f32(recip, vreinterpretq_f32_m128(b)));
1779 return vreinterpretq_m128_f32(vmulq_f32(vreinterpretq_f32_m128(a), recip));
1780#endif
1781}
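
// Editor's illustrative sketch (not part of sse2neon): the scalar view of the
// ARMv7-A fallback above. vrecpeq_f32 yields a rough estimate x of 1/b and
// vrecpsq_f32(x, b) computes (2 - x*b), so each x = x * (2 - x*b) step is one
// Newton-Raphson iteration that roughly doubles the number of correct bits.
FORCE_INLINE float _sse2neon_example_refined_reciprocal(float b)
{
    float32x4_t vb = vdupq_n_f32(b);
    float32x4_t x = vrecpeq_f32(vb);      /* rough estimate of 1/b */
    x = vmulq_f32(x, vrecpsq_f32(x, vb)); /* first Newton-Raphson step */
    x = vmulq_f32(x, vrecpsq_f32(x, vb)); /* second step, as in _mm_div_ps */
    return vgetq_lane_f32(x, 0);
}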
1782
1783// Divide the lower single-precision (32-bit) floating-point element in a by the
1784// lower single-precision (32-bit) floating-point element in b, store the result
1785// in the lower element of dst, and copy the upper 3 packed elements from a to
1786// the upper elements of dst.
1787// Warning: on ARMv7-A this does not produce the same result as the Intel
1788// intrinsic and is not IEEE-compliant.
1789// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_div_ss
1790FORCE_INLINE __m128 _mm_div_ss(__m128 a, __m128 b)
1791{
1792 float32_t value =
1793 vgetq_lane_f32(vreinterpretq_f32_m128(_mm_div_ps(a, b)), 0);
1794 return vreinterpretq_m128_f32(
1795     vsetq_lane_f32(value, vreinterpretq_f32_m128(a), 0));
1796}
1797
1798// Extract a 16-bit integer from a, selected with imm8, and store the result in
1799// the lower element of dst.
1800// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_extract_pi16
1801#define _mm_extract_pi16(a, imm) \
1802 (int32_t) vget_lane_u16(vreinterpret_u16_m64(a), (imm))
1803
1804// Free aligned memory that was allocated with _mm_malloc.
1805// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_free
1806#if !defined(SSE2NEON_ALLOC_DEFINED)
1807FORCE_INLINE void _mm_free(void *addr)
1808{
1809#if defined(_WIN32)
1810 _aligned_free(addr);
1811#else
1812 free(addr);
1813#endif
1814}
1815#endif
1816
1817FORCE_INLINE uint64_t _sse2neon_get_fpcr(void)
1818{
1819 uint64_t value;
1820#if defined(_MSC_VER) && !defined(__clang__)
1821 value = _ReadStatusReg(ARM64_FPCR);
1822#else
1823 __asm__ __volatile__("mrs %0, FPCR" : "=r"(value)); /* read */
1824#endif
1825 return value;
1826}
1827
1828FORCE_INLINE void _sse2neon_set_fpcr(uint64_t value)
1829{
1830#if defined(_MSC_VER) && !defined(__clang__)
1831 _WriteStatusReg(ARM64_FPCR, value);
1832#else
1833 __asm__ __volatile__("msr FPCR, %0" ::"r"(value)); /* write */
1834#endif
1835}
1836
1837// Macro: Get the flush zero bits from the MXCSR control and status register.
1838// The flush zero may contain any of the following flags: _MM_FLUSH_ZERO_ON or
1839// _MM_FLUSH_ZERO_OFF
1840// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_MM_GET_FLUSH_ZERO_MODE
1841FORCE_INLINE unsigned int _sse2neon_mm_get_flush_zero_mode(void)
1842{
1843 union {
1844 fpcr_bitfield field;
1845#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
1846 uint64_t value;
1847#else
1848 uint32_t value;
1849#endif
1850 } r;
1851
1852#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
1853 r.value = _sse2neon_get_fpcr();
1854#else
1855 __asm__ __volatile__("vmrs %0, FPSCR" : "=r"(r.value)); /* read */
1856#endif
1857
1858 return r.field.bit24 ? _MM_FLUSH_ZERO_ON : _MM_FLUSH_ZERO_OFF;
1859}
1860
1861// Macro: Get the rounding mode bits from the MXCSR control and status register.
1862// The rounding mode may contain any of the following flags: _MM_ROUND_NEAREST,
1863// _MM_ROUND_DOWN, _MM_ROUND_UP, _MM_ROUND_TOWARD_ZERO
1864// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_MM_GET_ROUNDING_MODE
1865FORCE_INLINE unsigned int _MM_GET_ROUNDING_MODE(void)
1866{
1867 switch (fegetround()) {
1868 case FE_TONEAREST:
1869 return _MM_ROUND_NEAREST;
1870 case FE_DOWNWARD:
1871 return _MM_ROUND_DOWN;
1872 case FE_UPWARD:
1873 return _MM_ROUND_UP;
1874 case FE_TOWARDZERO:
1875 return _MM_ROUND_TOWARD_ZERO;
1876 default:
1877 // fegetround() should return FE_TONEAREST, FE_DOWNWARD, FE_UPWARD or
1878 // FE_TOWARDZERO on success; any other (error) value is treated as
1879 // truncation, so _MM_ROUND_TOWARD_ZERO is returned.
1880 return _MM_ROUND_TOWARD_ZERO;
1881 }
1882}
1883
1884// Copy a to dst, and insert the 16-bit integer i into dst at the location
1885// specified by imm8.
1886// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_insert_pi16
1887#define _mm_insert_pi16(a, b, imm) \
1888 vreinterpret_m64_s16(vset_lane_s16((b), vreinterpret_s16_m64(a), (imm)))
1889
1890// Load 128-bits (composed of 4 packed single-precision (32-bit) floating-point
1891// elements) from memory into dst. mem_addr must be aligned on a 16-byte
1892// boundary or a general-protection exception may be generated.
1893// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load_ps
1894FORCE_INLINE __m128 _mm_load_ps(const float *p)
1895{
1896 return vreinterpretq_m128_f32(vld1q_f32(p));
1897}
1898
1899// Load a single-precision (32-bit) floating-point element from memory into all
1900// elements of dst.
1901//
1902// dst[31:0] := MEM[mem_addr+31:mem_addr]
1903// dst[63:32] := MEM[mem_addr+31:mem_addr]
1904// dst[95:64] := MEM[mem_addr+31:mem_addr]
1905// dst[127:96] := MEM[mem_addr+31:mem_addr]
1906//
1907// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load_ps1
1908#define _mm_load_ps1 _mm_load1_ps
1909
1910// Load a single-precision (32-bit) floating-point element from memory into the
1911// lower of dst, and zero the upper 3 elements. mem_addr does not need to be
1912// aligned on any particular boundary.
1913// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load_ss
1914FORCE_INLINE __m128 _mm_load_ss(const float *p)
1915{
1916 return vreinterpretq_m128_f32(vsetq_lane_f32(*p, vdupq_n_f32(0), 0));
1917}
1918
1919// Load a single-precision (32-bit) floating-point element from memory into all
1920// elements of dst.
1921// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load1_ps
1922FORCE_INLINE __m128 _mm_load1_ps(const float *p)
1923{
1924 return vreinterpretq_m128_f32(vld1q_dup_f32(p));
1925}
1926
1927// Load 2 single-precision (32-bit) floating-point elements from memory into the
1928// upper 2 elements of dst, and copy the lower 2 elements from a to dst.
1929// mem_addr does not need to be aligned on any particular boundary.
1930// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadh_pi
1931FORCE_INLINE __m128 _mm_loadh_pi(__m128 a, __m64 const *p)
1932{
1933 return vreinterpretq_m128_f32(
1934     vcombine_f32(vget_low_f32(a), vld1_f32((const float32_t *) p)));
1935}
1936
1937// Load 2 single-precision (32-bit) floating-point elements from memory into the
1938// lower 2 elements of dst, and copy the upper 2 elements from a to dst.
1939// mem_addr does not need to be aligned on any particular boundary.
1940// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadl_pi
1941FORCE_INLINE __m128 _mm_loadl_pi(__m128 a, __m64 const *p)
1942{
1943 return vreinterpretq_m128_f32(
1944     vcombine_f32(vld1_f32((const float32_t *) p), vget_high_f32(a)));
1945}
1946
1947// Load 4 single-precision (32-bit) floating-point elements from memory into dst
1948// in reverse order. mem_addr must be aligned on a 16-byte boundary or a
1949// general-protection exception may be generated.
1950// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadr_ps
1951FORCE_INLINE __m128 _mm_loadr_ps(const float *p)
1952{
1953 float32x4_t v = vrev64q_f32(vld1q_f32(p));
1954 return vreinterpretq_m128_f32(vextq_f32(v, v, 2));
1955}
1956
1957// Load 128-bits (composed of 4 packed single-precision (32-bit) floating-point
1958// elements) from memory into dst. mem_addr does not need to be aligned on any
1959// particular boundary.
1960// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadu_ps
1961FORCE_INLINE __m128 _mm_loadu_ps(const float *p)
1962{
1963 // For NEON, alignment doesn't matter, so _mm_load_ps and _mm_loadu_ps
1964 // are equivalent.
1965 return vreinterpretq_m128_f32(vld1q_f32(p));
1966}
1967
1968// Load unaligned 16-bit integer from memory into the first element of dst.
1969// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadu_si16
1970FORCE_INLINE __m128i _mm_loadu_si16(const void *p)
1971{
1972 return vreinterpretq_m128i_s16(
1973     vsetq_lane_s16(*(const unaligned_int16_t *) p, vdupq_n_s16(0), 0));
1974}
1975
1976// Load unaligned 64-bit integer from memory into the first element of dst.
1977// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadu_si64
1978FORCE_INLINE __m128i _mm_loadu_si64(const void *p)
1979{
1980 return vreinterpretq_m128i_s64(
1981     vsetq_lane_s64(*(const unaligned_int64_t *) p, vdupq_n_s64(0), 0));
1982}
1983
1984// Allocate size bytes of memory, aligned to the alignment specified in align,
1985// and return a pointer to the allocated memory. _mm_free should be used to free
1986// memory that is allocated with _mm_malloc.
1987// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_malloc
1988#if !defined(SSE2NEON_ALLOC_DEFINED)
1989FORCE_INLINE void *_mm_malloc(size_t size, size_t align)
1990{
1991#if defined(_WIN32)
1992 return _aligned_malloc(size, align);
1993#else
1994 void *ptr;
1995 if (align == 1)
1996 return malloc(size);
1997 if (align == 2 || (sizeof(void *) == 8 && align == 4))
1998 align = sizeof(void *);
1999 if (!posix_memalign(&ptr, align, size))
2000 return ptr;
2001 return NULL;
2002#endif
2003}
2004#endif
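
// Editor's illustrative sketch (not part of sse2neon): _mm_malloc and _mm_free
// must be used as a pair, since the Windows path goes through _aligned_malloc/
// _aligned_free while the POSIX path uses posix_memalign/free. A 16-byte
// alignment is sufficient for the aligned load/store intrinsics such as
// _mm_load_ps/_mm_store_ps.
FORCE_INLINE float *_sse2neon_example_alloc_floats(size_t count)
{
    /* The caller must release this buffer with _mm_free(), not plain free(). */
    return (float *) _mm_malloc(count * sizeof(float), 16);
}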
2005
2006// Conditionally store 8-bit integer elements from a into memory using mask
2007// (elements are not stored when the highest bit is not set in the corresponding
2008// element) and a non-temporal memory hint.
2009// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskmove_si64
2010FORCE_INLINE void _mm_maskmove_si64(__m64 a, __m64 mask, char *mem_addr)
2011{
2012 int8x8_t shr_mask = vshr_n_s8(vreinterpret_s8_m64(mask), 7);
2013 __m128 b = _mm_load_ps((const float *) mem_addr);
2014 int8x8_t masked =
2015 vbsl_s8(vreinterpret_u8_s8(shr_mask), vreinterpret_s8_m64(a),
2016 vreinterpret_s8_u64(vget_low_u64(vreinterpretq_u64_m128(b))));
2017 vst1_s8((int8_t *) mem_addr, masked);
2018}
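
// Editor's illustrative sketch (not part of sse2neon): only the bytes of a
// whose corresponding mask byte has its most significant bit set are written;
// the remaining destination bytes keep their previous contents.
FORCE_INLINE void _sse2neon_example_masked_store(char *dst)
{
    const uint8_t src[8]  = {1, 2, 3, 4, 5, 6, 7, 8};
    const uint8_t keep[8] = {0x80, 0, 0x80, 0, 0, 0, 0, 0}; /* store only src bytes 0 and 2 */
    _mm_maskmove_si64(vreinterpret_m64_u8(vld1_u8(src)),
                      vreinterpret_m64_u8(vld1_u8(keep)), dst);
}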
2019
2020// Conditionally store 8-bit integer elements from a into memory using mask
2021// (elements are not stored when the highest bit is not set in the corresponding
2022// element) and a non-temporal memory hint.
2023// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_m_maskmovq
2024#define _m_maskmovq(a, mask, mem_addr) _mm_maskmove_si64(a, mask, mem_addr)
2025
2026// Compare packed signed 16-bit integers in a and b, and store packed maximum
2027// values in dst.
2028// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_pi16
2029FORCE_INLINE __m64 _mm_max_pi16(__m64 a, __m64 b)
2030{
2031 return vreinterpret_m64_s16(
2032     vmax_s16(vreinterpret_s16_m64(a), vreinterpret_s16_m64(b)));
2033}
2034
2035// Compare packed single-precision (32-bit) floating-point elements in a and b,
2036// and store packed maximum values in dst. dst does not follow the IEEE Standard
2037// for Floating-Point Arithmetic (IEEE 754) maximum value when inputs are NaN or
2038// signed-zero values.
2039// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_ps
2040FORCE_INLINE __m128 _mm_max_ps(__m128 a, __m128 b)
2041{
2042#if SSE2NEON_PRECISE_MINMAX
2043 float32x4_t _a = vreinterpretq_f32_m128(a);
2044 float32x4_t _b = vreinterpretq_f32_m128(b);
2045 return vreinterpretq_m128_f32(vbslq_f32(vcgtq_f32(_a, _b), _a, _b));
2046#else
2047 return vreinterpretq_m128_f32(
2048     vmaxq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
2049#endif
2050}
2051
2052// Compare packed unsigned 8-bit integers in a and b, and store packed maximum
2053// values in dst.
2054// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_pu8
2055FORCE_INLINE __m64 _mm_max_pu8(__m64 a, __m64 b)
2056{
2057 return vreinterpret_m64_u8(
2058     vmax_u8(vreinterpret_u8_m64(a), vreinterpret_u8_m64(b)));
2059}
2060
2061// Compare the lower single-precision (32-bit) floating-point elements in a and
2062// b, store the maximum value in the lower element of dst, and copy the upper 3
2063// packed elements from a to the upper element of dst. dst does not follow the
2064// IEEE Standard for Floating-Point Arithmetic (IEEE 754) maximum value when
2065// inputs are NaN or signed-zero values.
2066// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_ss
2067FORCE_INLINE __m128 _mm_max_ss(__m128 a, __m128 b)
2068{
2069 float32_t value = vgetq_lane_f32(_mm_max_ps(a, b), 0);
2070 return vreinterpretq_m128_f32(
2071     vsetq_lane_f32(value, vreinterpretq_f32_m128(a), 0));
2072}
2073
2074// Compare packed signed 16-bit integers in a and b, and store packed minimum
2075// values in dst.
2076// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_pi16
2077FORCE_INLINE __m64 _mm_min_pi16(__m64 a, __m64 b)
2078{
2079 return vreinterpret_m64_s16(
2080     vmin_s16(vreinterpret_s16_m64(a), vreinterpret_s16_m64(b)));
2081}
2082
2083// Compare packed single-precision (32-bit) floating-point elements in a and b,
2084// and store packed minimum values in dst. dst does not follow the IEEE Standard
2085// for Floating-Point Arithmetic (IEEE 754) minimum value when inputs are NaN or
2086// signed-zero values.
2087// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_ps
2088FORCE_INLINE __m128 _mm_min_ps(__m128 a, __m128 b)
2089{
2090#if SSE2NEON_PRECISE_MINMAX
2091 float32x4_t _a = vreinterpretq_f32_m128(a);
2092 float32x4_t _b = vreinterpretq_f32_m128(b);
2093 return vreinterpretq_m128_f32(vbslq_f32(vcltq_f32(_a, _b), _a, _b));
2094#else
2095 return vreinterpretq_m128_f32(
2096     vminq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
2097#endif
2098}
2099
2100// Compare packed unsigned 8-bit integers in a and b, and store packed minimum
2101// values in dst.
2102// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_pu8
2103FORCE_INLINE __m64 _mm_min_pu8(__m64 a, __m64 b)
2104{
2105 return vreinterpret_m64_u8(
2106     vmin_u8(vreinterpret_u8_m64(a), vreinterpret_u8_m64(b)));
2107}
2108
2109// Compare the lower single-precision (32-bit) floating-point elements in a and
2110// b, store the minimum value in the lower element of dst, and copy the upper 3
2111// packed elements from a to the upper element of dst. dst does not follow the
2112// IEEE Standard for Floating-Point Arithmetic (IEEE 754) minimum value when
2113// inputs are NaN or signed-zero values.
2114// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_ss
2115FORCE_INLINE __m128 _mm_min_ss(__m128 a, __m128 b)
2116{
2117 float32_t value = vgetq_lane_f32(_mm_min_ps(a, b), 0);
2118 return vreinterpretq_m128_f32(
2119     vsetq_lane_f32(value, vreinterpretq_f32_m128(a), 0));
2120}
2121
2122// Move the lower single-precision (32-bit) floating-point element from b to the
2123// lower element of dst, and copy the upper 3 packed elements from a to the
2124// upper elements of dst.
2125// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_move_ss
2126FORCE_INLINE __m128 _mm_move_ss(__m128 a, __m128 b)
2127{
2128 return vreinterpretq_m128_f32(
2129     vsetq_lane_f32(vgetq_lane_f32(vreinterpretq_f32_m128(b), 0),
2130 vreinterpretq_f32_m128(a), 0));
2131}
2132
2133// Move the upper 2 single-precision (32-bit) floating-point elements from b to
2134// the lower 2 elements of dst, and copy the upper 2 elements from a to the
2135// upper 2 elements of dst.
2136// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movehl_ps
2137FORCE_INLINE __m128 _mm_movehl_ps(__m128 a, __m128 b)
2138{
2139#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
2140 return vreinterpretq_m128_u64(
2141     vzip2q_u64(vreinterpretq_u64_m128(b), vreinterpretq_u64_m128(a)));
2142#else
2143 float32x2_t a32 = vget_high_f32(vreinterpretq_f32_m128(a));
2144 float32x2_t b32 = vget_high_f32(vreinterpretq_f32_m128(b));
2145 return vreinterpretq_m128_f32(vcombine_f32(b32, a32));
2146#endif
2147}
2148
2149// Move the lower 2 single-precision (32-bit) floating-point elements from b to
2150// the upper 2 elements of dst, and copy the lower 2 elements from a to the
2151// lower 2 elements of dst.
2152// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movelh_ps
2153FORCE_INLINE __m128 _mm_movelh_ps(__m128 __A, __m128 __B)
2154{
2155 float32x2_t a10 = vget_low_f32(vreinterpretq_f32_m128(__A));
2156 float32x2_t b10 = vget_low_f32(vreinterpretq_f32_m128(__B));
2157 return vreinterpretq_m128_f32(vcombine_f32(a10, b10));
2158}
2159
2160// Create mask from the most significant bit of each 8-bit element in a, and
2161// store the result in dst.
2162// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movemask_pi8
2163FORCE_INLINE int _mm_movemask_pi8(__m64 a)
2164{
2165 uint8x8_t input = vreinterpret_u8_m64(a);
2166#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
2167 static const int8_t shift[8] = {0, 1, 2, 3, 4, 5, 6, 7};
2168 uint8x8_t tmp = vshr_n_u8(input, 7);
2169 return vaddv_u8(vshl_u8(tmp, vld1_s8(shift)));
2170#else
2171 // Refer to the implementation of `_mm_movemask_epi8`
2172 uint16x4_t high_bits = vreinterpret_u16_u8(vshr_n_u8(input, 7));
2173 uint32x2_t paired16 =
2174 vreinterpret_u32_u16(vsra_n_u16(high_bits, high_bits, 7));
2175 uint8x8_t paired32 =
2176 vreinterpret_u8_u32(vsra_n_u32(paired16, paired16, 14));
2177 return vget_lane_u8(paired32, 0) | ((int) vget_lane_u8(paired32, 4) << 4);
2178#endif
2179}
2180
2181// Set each bit of mask dst based on the most significant bit of the
2182// corresponding packed single-precision (32-bit) floating-point element in a.
2183// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movemask_ps
2184FORCE_INLINE int _mm_movemask_ps(__m128 a)
2185{
2186 uint32x4_t input = vreinterpretq_u32_m128(a);
2187#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
2188 static const int32_t shift[4] = {0, 1, 2, 3};
2189 uint32x4_t tmp = vshrq_n_u32(input, 31);
2190 return vaddvq_u32(vshlq_u32(tmp, vld1q_s32(shift)));
2191#else
2192 // Uses the exact same method as _mm_movemask_epi8, see that for details.
2193 // Shift out everything but the sign bits with a 32-bit unsigned shift
2194 // right.
2195 uint64x2_t high_bits = vreinterpretq_u64_u32(vshrq_n_u32(input, 31));
2196 // Merge the two pairs together with a 64-bit unsigned shift right + add.
2197 uint8x16_t paired =
2198 vreinterpretq_u8_u64(vsraq_n_u64(high_bits, high_bits, 31));
2199 // Extract the result.
2200 return vgetq_lane_u8(paired, 0) | (vgetq_lane_u8(paired, 8) << 2);
2201#endif
2202}
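
// Editor's illustrative sketch (not part of sse2neon): a worked example of the
// mask layout. Lane i of the input contributes bit i of the result, so a
// vector with negative values in lanes 0 and 2 produces 0b0101 = 5.
FORCE_INLINE int _sse2neon_example_sign_mask(void)
{
    const float vals[4] = {-1.0f, 2.0f, -3.0f, 4.0f}; /* lanes 0..3 */
    return _mm_movemask_ps(vreinterpretq_m128_f32(vld1q_f32(vals)));
}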
2203
2204// Multiply packed single-precision (32-bit) floating-point elements in a and b,
2205// and store the results in dst.
2206// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mul_ps
2207FORCE_INLINE __m128 _mm_mul_ps(__m128 a, __m128 b)
2208{
2209 return vreinterpretq_m128_f32(
2210     vmulq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
2211}
2212
2213// Multiply the lower single-precision (32-bit) floating-point element in a and
2214// b, store the result in the lower element of dst, and copy the upper 3 packed
2215// elements from a to the upper elements of dst.
2216// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mul_ss
2217FORCE_INLINE __m128 _mm_mul_ss(__m128 a, __m128 b)
2218{
2219 return _mm_move_ss(a, _mm_mul_ps(a, b));
2220}
2221
2222// Multiply the packed unsigned 16-bit integers in a and b, producing
2223// intermediate 32-bit integers, and store the high 16 bits of the intermediate
2224// integers in dst.
2225// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mulhi_pu16
2226FORCE_INLINE __m64 _mm_mulhi_pu16(__m64 a, __m64 b)
2227{
2228 return vreinterpret_m64_u16(vshrn_n_u32(
2229 vmull_u16(vreinterpret_u16_m64(a), vreinterpret_u16_m64(b)), 16));
2230}
2231
2232// Compute the bitwise OR of packed single-precision (32-bit) floating-point
2233// elements in a and b, and store the results in dst.
2234// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_or_ps
2235FORCE_INLINE __m128 _mm_or_ps(__m128 a, __m128 b)
2236{
2237 return vreinterpretq_m128_s32(
2238     vorrq_s32(vreinterpretq_s32_m128(a), vreinterpretq_s32_m128(b)));
2239}
2240
2241// Average packed unsigned 8-bit integers in a and b, and store the results in
2242// dst.
2243// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_m_pavgb
2244#define _m_pavgb(a, b) _mm_avg_pu8(a, b)
2245
2246// Average packed unsigned 16-bit integers in a and b, and store the results in
2247// dst.
2248// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_m_pavgw
2249#define _m_pavgw(a, b) _mm_avg_pu16(a, b)
2250
2251// Extract a 16-bit integer from a, selected with imm8, and store the result in
2252// the lower element of dst.
2253// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_m_pextrw
2254#define _m_pextrw(a, imm) _mm_extract_pi16(a, imm)
2255
2256// Copy a to dst, and insert the 16-bit integer i into dst at the location
2257// specified by imm8.
2258// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=m_pinsrw
2259#define _m_pinsrw(a, i, imm) _mm_insert_pi16(a, i, imm)
2260
2261// Compare packed signed 16-bit integers in a and b, and store packed maximum
2262// values in dst.
2263// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_m_pmaxsw
2264#define _m_pmaxsw(a, b) _mm_max_pi16(a, b)
2265
2266// Compare packed unsigned 8-bit integers in a and b, and store packed maximum
2267// values in dst.
2268// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_m_pmaxub
2269#define _m_pmaxub(a, b) _mm_max_pu8(a, b)
2270
2271// Compare packed signed 16-bit integers in a and b, and store packed minimum
2272// values in dst.
2273// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_m_pminsw
2274#define _m_pminsw(a, b) _mm_min_pi16(a, b)
2275
2276// Compare packed unsigned 8-bit integers in a and b, and store packed minimum
2277// values in dst.
2278// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_m_pminub
2279#define _m_pminub(a, b) _mm_min_pu8(a, b)
2280
2281// Create mask from the most significant bit of each 8-bit element in a, and
2282// store the result in dst.
2283// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_m_pmovmskb
2284#define _m_pmovmskb(a) _mm_movemask_pi8(a)
2285
2286// Multiply the packed unsigned 16-bit integers in a and b, producing
2287// intermediate 32-bit integers, and store the high 16 bits of the intermediate
2288// integers in dst.
2289// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_m_pmulhuw
2290#define _m_pmulhuw(a, b) _mm_mulhi_pu16(a, b)
2291
2292// Fetch the line of data from memory that contains address p to a location in
2293// the cache hierarchy specified by the locality hint i.
2294// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_prefetch
2295FORCE_INLINE void _mm_prefetch(char const *p, int i)
2296{
2297 (void) i;
2298#if defined(_MSC_VER) && !defined(__clang__)
2299 switch (i) {
2300 case _MM_HINT_NTA:
2301 __prefetch2(p, 1);
2302 break;
2303 case _MM_HINT_T0:
2304 __prefetch2(p, 0);
2305 break;
2306 case _MM_HINT_T1:
2307 __prefetch2(p, 2);
2308 break;
2309 case _MM_HINT_T2:
2310 __prefetch2(p, 4);
2311 break;
2312 }
2313#else
2314 switch (i) {
2315 case _MM_HINT_NTA:
2316 __builtin_prefetch(p, 0, 0);
2317 break;
2318 case _MM_HINT_T0:
2319 __builtin_prefetch(p, 0, 3);
2320 break;
2321 case _MM_HINT_T1:
2322 __builtin_prefetch(p, 0, 2);
2323 break;
2324 case _MM_HINT_T2:
2325 __builtin_prefetch(p, 0, 1);
2326 break;
2327 }
2328#endif
2329}
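
// Editor's illustrative sketch (not part of sse2neon): the hint selects how
// high in the cache hierarchy the line is kept. _MM_HINT_T0 requests the most
// persistent placement, _MM_HINT_NTA the least.
FORCE_INLINE void _sse2neon_example_prefetch_row(const float *row)
{
    _mm_prefetch((const char *) row, _MM_HINT_T0);
}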
2330
2331// Compute the absolute differences of packed unsigned 8-bit integers in a and
2332// b, then horizontally sum each consecutive 8 differences to produce four
2333// unsigned 16-bit integers, and pack these unsigned 16-bit integers in the low
2334// 16 bits of dst.
2335// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=m_psadbw
2336#define _m_psadbw(a, b) _mm_sad_pu8(a, b)
2337
2338// Shuffle 16-bit integers in a using the control in imm8, and store the results
2339// in dst.
2340// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_m_pshufw
2341#define _m_pshufw(a, imm) _mm_shuffle_pi16(a, imm)
2342
2343// Compute the approximate reciprocal of packed single-precision (32-bit)
2344// floating-point elements in a, and store the results in dst. The maximum
2345// relative error for this approximation is less than 1.5*2^-12.
2346// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_rcp_ps
2347FORCE_INLINE __m128 _mm_rcp_ps(__m128 in)
2348{
2349 float32x4_t recip = vrecpeq_f32(vreinterpretq_f32_m128(in));
2350 recip = vmulq_f32(recip, vrecpsq_f32(recip, vreinterpretq_f32_m128(in)));
2351#if SSE2NEON_PRECISE_DIV
2352 // Additional Newton-Raphson iteration for accuracy
2353 recip = vmulq_f32(recip, vrecpsq_f32(recip, vreinterpretq_f32_m128(in)));
2354#endif
2355 return vreinterpretq_m128_f32(recip);
2356}
2357
2358// Compute the approximate reciprocal of the lower single-precision (32-bit)
2359// floating-point element in a, store the result in the lower element of dst,
2360// and copy the upper 3 packed elements from a to the upper elements of dst. The
2361// maximum relative error for this approximation is less than 1.5*2^-12.
2362// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_rcp_ss
2363FORCE_INLINE __m128 _mm_rcp_ss(__m128 a)
2364{
2365 return _mm_move_ss(a, _mm_rcp_ps(a));
2366}
2367
2368// Compute the approximate reciprocal square root of packed single-precision
2369// (32-bit) floating-point elements in a, and store the results in dst. The
2370// maximum relative error for this approximation is less than 1.5*2^-12.
2371// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_rsqrt_ps
2372FORCE_INLINE __m128 _mm_rsqrt_ps(__m128 in)
2373{
2374 float32x4_t out = vrsqrteq_f32(vreinterpretq_f32_m128(in));
2375
2376 // Generate masks for detecting whether input has any 0.0f/-0.0f
2377 // (which becomes positive/negative infinity by IEEE-754 arithmetic rules).
2378 const uint32x4_t pos_inf = vdupq_n_u32(0x7F800000);
2379 const uint32x4_t neg_inf = vdupq_n_u32(0xFF800000);
2380 const uint32x4_t has_pos_zero =
2381 vceqq_u32(pos_inf, vreinterpretq_u32_f32(out));
2382 const uint32x4_t has_neg_zero =
2383 vceqq_u32(neg_inf, vreinterpretq_u32_f32(out));
2384
2385 out = vmulq_f32(
2386 out, vrsqrtsq_f32(vmulq_f32(vreinterpretq_f32_m128(in), out), out));
2387#if SSE2NEON_PRECISE_SQRT
2388 // Additional Newton-Raphson iteration for accuracy
2389 out = vmulq_f32(
2390 out, vrsqrtsq_f32(vmulq_f32(vreinterpretq_f32_m128(in), out), out));
2391#endif
2392
2393 // Set output vector element to infinity/negative-infinity if
2394 // the corresponding input vector element is 0.0f/-0.0f.
2395 out = vbslq_f32(has_pos_zero, (float32x4_t) pos_inf, out);
2396 out = vbslq_f32(has_neg_zero, (float32x4_t) neg_inf, out);
2397
2398 return vreinterpretq_m128_f32(out);
2399}
2400
2401// Compute the approximate reciprocal square root of the lower single-precision
2402// (32-bit) floating-point element in a, store the result in the lower element
2403// of dst, and copy the upper 3 packed elements from a to the upper elements of
2404// dst.
2405// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_rsqrt_ss
2406FORCE_INLINE __m128 _mm_rsqrt_ss(__m128 in)
2407{
2408 return vsetq_lane_f32(vgetq_lane_f32(_mm_rsqrt_ps(in), 0), in, 0);
2409}
2410
2411// Compute the absolute differences of packed unsigned 8-bit integers in a and
2412// b, then horizontally sum each consecutive 8 differences to produce four
2413// unsigned 16-bit integers, and pack these unsigned 16-bit integers in the low
2414// 16 bits of dst.
2415// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sad_pu8
2416FORCE_INLINE __m64 _mm_sad_pu8(__m64 a, __m64 b)
2417{
2418 uint64x1_t t = vpaddl_u32(vpaddl_u16(
2419 vpaddl_u8(vabd_u8(vreinterpret_u8_m64(a), vreinterpret_u8_m64(b)))));
2420 return vreinterpret_m64_u16(
2421 vset_lane_u16((uint16_t) vget_lane_u64(t, 0), vdup_n_u16(0), 0));
2422}
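
// Editor's illustrative sketch (not part of sse2neon): a worked example of the
// sum of absolute differences. |1-8| + |2-7| + ... + |8-1| = 32, returned in
// the low 16 bits of the result.
FORCE_INLINE int _sse2neon_example_sad(void)
{
    const uint8_t a_bytes[8] = {1, 2, 3, 4, 5, 6, 7, 8};
    const uint8_t b_bytes[8] = {8, 7, 6, 5, 4, 3, 2, 1};
    __m64 sad = _mm_sad_pu8(vreinterpret_m64_u8(vld1_u8(a_bytes)),
                            vreinterpret_m64_u8(vld1_u8(b_bytes)));
    return vget_lane_u16(vreinterpret_u16_m64(sad), 0); /* = 32 */
}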
2423
2424// Macro: Set the flush zero bits of the MXCSR control and status register to
2425// the value in unsigned 32-bit integer a. The flush zero may contain any of the
2426// following flags: _MM_FLUSH_ZERO_ON or _MM_FLUSH_ZERO_OFF
2427// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_MM_SET_FLUSH_ZERO_MODE
2428FORCE_INLINE void _sse2neon_mm_set_flush_zero_mode(unsigned int flag)
2429{
2430 // AArch32 Advanced SIMD arithmetic always uses the Flush-to-zero setting,
2431 // regardless of the value of the FZ bit.
2432 union {
2433 fpcr_bitfield field;
2434#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
2435 uint64_t value;
2436#else
2437 uint32_t value;
2438#endif
2439 } r;
2440
2441#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
2442 r.value = _sse2neon_get_fpcr();
2443#else
2444 __asm__ __volatile__("vmrs %0, FPSCR" : "=r"(r.value)); /* read */
2445#endif
2446
2447 r.field.bit24 = (flag & _MM_FLUSH_ZERO_MASK) == _MM_FLUSH_ZERO_ON;
2448
2449#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
2450 _sse2neon_set_fpcr(r.value);
2451#else
2452 __asm__ __volatile__("vmsr FPSCR, %0" ::"r"(r)); /* write */
2453#endif
2454}
2455
2456// Set packed single-precision (32-bit) floating-point elements in dst with the
2457// supplied values.
2458// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_ps
2459FORCE_INLINE __m128 _mm_set_ps(float w, float z, float y, float x)
2460{
2461 float ALIGN_STRUCT(16) data[4] = {x, y, z, w};
2462 return vreinterpretq_m128_f32(vld1q_f32(data));
2463}
2464
2465// Broadcast single-precision (32-bit) floating-point value a to all elements of
2466// dst.
2467// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_ps1
2468FORCE_INLINE __m128 _mm_set_ps1(float _w)
2469{
2470 return vreinterpretq_m128_f32(vdupq_n_f32(_w));
2471}
2472
2473// Macro: Set the rounding mode bits of the MXCSR control and status register to
2474// the value in unsigned 32-bit integer a. The rounding mode may contain any of
2475// the following flags: _MM_ROUND_NEAREST, _MM_ROUND_DOWN, _MM_ROUND_UP,
2476// _MM_ROUND_TOWARD_ZERO
2477// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_MM_SET_ROUNDING_MODE
2478FORCE_INLINE void _MM_SET_ROUNDING_MODE(int rounding)
2479{
2480 switch (rounding) {
2481 case _MM_ROUND_NEAREST:
2482 rounding = FE_TONEAREST;
2483 break;
2484 case _MM_ROUND_DOWN:
2485 rounding = FE_DOWNWARD;
2486 break;
2487 case _MM_ROUND_UP:
2488 rounding = FE_UPWARD;
2489 break;
2490 case _MM_ROUND_TOWARD_ZERO:
2491 rounding = FE_TOWARDZERO;
2492 break;
2493 default:
2494 // rounding must be one of _MM_ROUND_NEAREST, _MM_ROUND_DOWN,
2495 // _MM_ROUND_UP or _MM_ROUND_TOWARD_ZERO; any other (invalid) value is
2496 // treated as FE_TOWARDZERO (truncate).
2497 rounding = FE_TOWARDZERO;
2498 }
2499 fesetround(rounding);
2500}
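
// Editor's illustrative sketch (not part of sse2neon): the rounding-mode
// macros map MXCSR-style flags onto fegetround/fesetround, so a scoped mode
// switch looks the same as on x86. Saving and restoring the previous mode
// keeps the change local to the caller.
FORCE_INLINE int _sse2neon_example_convert_toward_zero(__m128 a)
{
    unsigned int saved = _MM_GET_ROUNDING_MODE();
    _MM_SET_ROUNDING_MODE(_MM_ROUND_TOWARD_ZERO);
    int r = _mm_cvt_ss2si(a); /* honors the current rounding mode */
    _MM_SET_ROUNDING_MODE(saved);
    return r;
}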
2501
2502// Copy single-precision (32-bit) floating-point element a to the lower element
2503// of dst, and zero the upper 3 elements.
2504// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_ss
2505FORCE_INLINE __m128 _mm_set_ss(float a)
2506{
2507 return vreinterpretq_m128_f32(vsetq_lane_f32(a, vdupq_n_f32(0), 0));
2508}
2509
2510// Broadcast single-precision (32-bit) floating-point value a to all elements of
2511// dst.
2512// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set1_ps
2513FORCE_INLINE __m128 _mm_set1_ps(float _w)
2514{
2515 return vreinterpretq_m128_f32(vdupq_n_f32(_w));
2516}
2517
2518// Set the MXCSR control and status register with the value in unsigned 32-bit
2519// integer a.
2520// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setcsr
2521// FIXME: _mm_setcsr() implementation supports changing the rounding mode only.
2522FORCE_INLINE void _mm_setcsr(unsigned int a)
2523{
2524 _MM_SET_ROUNDING_MODE(a);
2525}
2526
2527// Get the unsigned 32-bit value of the MXCSR control and status register.
2528// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_getcsr
2529// FIXME: _mm_getcsr() implementation supports reading the rounding mode only.
2530FORCE_INLINE unsigned int _mm_getcsr(void)
2531{
2532 return _MM_GET_ROUNDING_MODE();
2533}
2534
2535// Set packed single-precision (32-bit) floating-point elements in dst with the
2536// supplied values in reverse order.
2537// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setr_ps
2538FORCE_INLINE __m128 _mm_setr_ps(float w, float z, float y, float x)
2539{
2540 float ALIGN_STRUCT(16) data[4] = {w, z, y, x};
2541 return vreinterpretq_m128_f32(vld1q_f32(data));
2542}
2543
2544// Return vector of type __m128 with all elements set to zero.
2545// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setzero_ps
2546FORCE_INLINE __m128 _mm_setzero_ps(void)
2547{
2548 return vreinterpretq_m128_f32(vdupq_n_f32(0));
2549}
2550
2551// Shuffle 16-bit integers in a using the control in imm8, and store the results
2552// in dst.
2553// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shuffle_pi16
2554#ifdef _sse2neon_shuffle
2555#define _mm_shuffle_pi16(a, imm) \
2556 vreinterpret_m64_s16(vshuffle_s16( \
2557 vreinterpret_s16_m64(a), vreinterpret_s16_m64(a), ((imm) & 0x3), \
2558 (((imm) >> 2) & 0x3), (((imm) >> 4) & 0x3), (((imm) >> 6) & 0x3)))
2559#else
2560#define _mm_shuffle_pi16(a, imm) \
2561 _sse2neon_define1( \
2562 __m64, a, int16x4_t ret; \
2563 ret = vmov_n_s16( \
2564 vget_lane_s16(vreinterpret_s16_m64(_a), (imm) & (0x3))); \
2565 ret = vset_lane_s16( \
2566 vget_lane_s16(vreinterpret_s16_m64(_a), ((imm) >> 2) & 0x3), ret, \
2567 1); \
2568 ret = vset_lane_s16( \
2569 vget_lane_s16(vreinterpret_s16_m64(_a), ((imm) >> 4) & 0x3), ret, \
2570 2); \
2571 ret = vset_lane_s16( \
2572 vget_lane_s16(vreinterpret_s16_m64(_a), ((imm) >> 6) & 0x3), ret, \
2573 3); \
2574 _sse2neon_return(vreinterpret_m64_s16(ret));)
2575#endif
2576
2577// Perform a serializing operation on all store-to-memory instructions that were
2578// issued prior to this instruction. Guarantees that every store instruction
2579// that precedes, in program order, is globally visible before any store
2580// instruction which follows the fence in program order.
2581// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sfence
2582FORCE_INLINE void _mm_sfence(void)
2583{
2584 _sse2neon_smp_mb();
2585}
2586
2587// Perform a serializing operation on all load-from-memory and store-to-memory
2588// instructions that were issued prior to this instruction. Guarantees that
2589// every memory access that precedes, in program order, the memory fence
2590// instruction is globally visible before any memory instruction which follows
2591// the fence in program order.
2592// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mfence
2593FORCE_INLINE void _mm_mfence(void)
2594{
2595 _sse2neon_smp_mb();
2596}
2597
2598// Perform a serializing operation on all load-from-memory instructions that
2599// were issued prior to this instruction. Guarantees that every load instruction
2600// that precedes, in program order, is globally visible before any load
2601// instruction which follows the fence in program order.
2602// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_lfence
2603FORCE_INLINE void _mm_lfence(void)
2604{
2605 _sse2neon_smp_mb();
2606}
2607
2608// FORCE_INLINE __m128 _mm_shuffle_ps(__m128 a, __m128 b, __constrange(0,255)
2609// int imm)
2610#ifdef _sse2neon_shuffle
2611#define _mm_shuffle_ps(a, b, imm) \
2612 __extension__({ \
2613 float32x4_t _input1 = vreinterpretq_f32_m128(a); \
2614 float32x4_t _input2 = vreinterpretq_f32_m128(b); \
2615 float32x4_t _shuf = \
2616 vshuffleq_s32(_input1, _input2, (imm) & (0x3), ((imm) >> 2) & 0x3, \
2617 (((imm) >> 4) & 0x3) + 4, (((imm) >> 6) & 0x3) + 4); \
2618 vreinterpretq_m128_f32(_shuf); \
2619 })
2620#else // generic
2621#define _mm_shuffle_ps(a, b, imm) \
2622 _sse2neon_define2( \
2623 __m128, a, b, __m128 ret; switch (imm) { \
2624 case _MM_SHUFFLE(1, 0, 3, 2): \
2625 ret = _mm_shuffle_ps_1032(_a, _b); \
2626 break; \
2627 case _MM_SHUFFLE(2, 3, 0, 1): \
2628 ret = _mm_shuffle_ps_2301(_a, _b); \
2629 break; \
2630 case _MM_SHUFFLE(0, 3, 2, 1): \
2631 ret = _mm_shuffle_ps_0321(_a, _b); \
2632 break; \
2633 case _MM_SHUFFLE(2, 1, 0, 3): \
2634 ret = _mm_shuffle_ps_2103(_a, _b); \
2635 break; \
2636 case _MM_SHUFFLE(1, 0, 1, 0): \
2637 ret = _mm_movelh_ps(_a, _b); \
2638 break; \
2639 case _MM_SHUFFLE(1, 0, 0, 1): \
2640 ret = _mm_shuffle_ps_1001(_a, _b); \
2641 break; \
2642 case _MM_SHUFFLE(0, 1, 0, 1): \
2643 ret = _mm_shuffle_ps_0101(_a, _b); \
2644 break; \
2645 case _MM_SHUFFLE(3, 2, 1, 0): \
2646 ret = _mm_shuffle_ps_3210(_a, _b); \
2647 break; \
2648 case _MM_SHUFFLE(0, 0, 1, 1): \
2649 ret = _mm_shuffle_ps_0011(_a, _b); \
2650 break; \
2651 case _MM_SHUFFLE(0, 0, 2, 2): \
2652 ret = _mm_shuffle_ps_0022(_a, _b); \
2653 break; \
2654 case _MM_SHUFFLE(2, 2, 0, 0): \
2655 ret = _mm_shuffle_ps_2200(_a, _b); \
2656 break; \
2657 case _MM_SHUFFLE(3, 2, 0, 2): \
2658 ret = _mm_shuffle_ps_3202(_a, _b); \
2659 break; \
2660 case _MM_SHUFFLE(3, 2, 3, 2): \
2661 ret = _mm_movehl_ps(_b, _a); \
2662 break; \
2663 case _MM_SHUFFLE(1, 1, 3, 3): \
2664 ret = _mm_shuffle_ps_1133(_a, _b); \
2665 break; \
2666 case _MM_SHUFFLE(2, 0, 1, 0): \
2667 ret = _mm_shuffle_ps_2010(_a, _b); \
2668 break; \
2669 case _MM_SHUFFLE(2, 0, 0, 1): \
2670 ret = _mm_shuffle_ps_2001(_a, _b); \
2671 break; \
2672 case _MM_SHUFFLE(2, 0, 3, 2): \
2673 ret = _mm_shuffle_ps_2032(_a, _b); \
2674 break; \
2675 default: \
2676 ret = _mm_shuffle_ps_default(_a, _b, (imm)); \
2677 break; \
2678 } _sse2neon_return(ret);)
2679#endif
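
// Editor's illustrative sketch (not part of sse2neon): the imm8 argument is
// normally built with _MM_SHUFFLE(z, y, x, w), which packs four 2-bit lane
// selectors. The two low selectors pick lanes from a, the two high selectors
// pick lanes from b; the call below fills dst lanes 0-1 with a[3] and lanes
// 2-3 with b[0].
FORCE_INLINE __m128 _sse2neon_example_shuffle(__m128 a, __m128 b)
{
    return _mm_shuffle_ps(a, b, _MM_SHUFFLE(0, 0, 3, 3));
}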
2680
2681// Compute the square root of packed single-precision (32-bit) floating-point
2682// elements in a, and store the results in dst.
2683// Due to ARMv7-A NEON's lack of a precise square root intrinsic, the square
2684// root is computed by multiplying the input by an estimate of its reciprocal
2685// square root that is refined with the Newton-Raphson method.
2686// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sqrt_ps
2687FORCE_INLINE __m128 _mm_sqrt_ps(__m128 in)
2688{
2689#if (defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)) && \
2690 !SSE2NEON_PRECISE_SQRT
2691 return vreinterpretq_m128_f32(vsqrtq_f32(vreinterpretq_f32_m128(in)));
2692#else
2693 float32x4_t recip = vrsqrteq_f32(vreinterpretq_f32_m128(in));
2694
2695 // Test for vrsqrteq_f32(0) -> positive infinity case.
2696 // Change to zero, so that s * 1/sqrt(s) result is zero too.
2697 const uint32x4_t pos_inf = vdupq_n_u32(0x7F800000);
2698 const uint32x4_t div_by_zero =
2699 vceqq_u32(pos_inf, vreinterpretq_u32_f32(recip));
2700 recip = vreinterpretq_f32_u32(
2701 vandq_u32(vmvnq_u32(div_by_zero), vreinterpretq_u32_f32(recip)));
2702
2703 recip = vmulq_f32(
2704 vrsqrtsq_f32(vmulq_f32(recip, recip), vreinterpretq_f32_m128(in)),
2705 recip);
2706 // Additional Newton-Raphson iteration for accuracy
2707 recip = vmulq_f32(
2708 vrsqrtsq_f32(vmulq_f32(recip, recip), vreinterpretq_f32_m128(in)),
2709 recip);
2710
2711 // sqrt(s) = s * 1/sqrt(s)
2712 return vreinterpretq_m128_f32(vmulq_f32(vreinterpretq_f32_m128(in), recip));
2713#endif
2714}
2715
2716// Compute the square root of the lower single-precision (32-bit) floating-point
2717// element in a, store the result in the lower element of dst, and copy the
2718// upper 3 packed elements from a to the upper elements of dst.
2719// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sqrt_ss
2720FORCE_INLINE __m128 _mm_sqrt_ss(__m128 in)
2721{
2722 float32_t value =
2723 vgetq_lane_f32(vreinterpretq_f32_m128(_mm_sqrt_ps(in)), 0);
2724 return vreinterpretq_m128_f32(
2725     vsetq_lane_f32(value, vreinterpretq_f32_m128(in), 0));
2726}
2727
2728// Store 128-bits (composed of 4 packed single-precision (32-bit) floating-point
2729// elements) from a into memory. mem_addr must be aligned on a 16-byte boundary
2730// or a general-protection exception may be generated.
2731// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_store_ps
2732FORCE_INLINE void _mm_store_ps(float *p, __m128 a)
2733{
2734 vst1q_f32(p, vreinterpretq_f32_m128(a));
2735}
2736
2737// Store the lower single-precision (32-bit) floating-point element from a into
2738// 4 contiguous elements in memory. mem_addr must be aligned on a 16-byte
2739// boundary or a general-protection exception may be generated.
2740// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_store_ps1
2741FORCE_INLINE void _mm_store_ps1(float *p, __m128 a)
2742{
2743 float32_t a0 = vgetq_lane_f32(vreinterpretq_f32_m128(a), 0);
2744 vst1q_f32(p, vdupq_n_f32(a0));
2745}
2746
2747// Store the lower single-precision (32-bit) floating-point element from a into
2748// memory. mem_addr does not need to be aligned on any particular boundary.
2749// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_store_ss
2750FORCE_INLINE void _mm_store_ss(float *p, __m128 a)
2751{
2752 vst1q_lane_f32(p, vreinterpretq_f32_m128(a), 0);
2753}
2754
2755// Store the lower single-precision (32-bit) floating-point element from a into
2756// 4 contiguous elements in memory. mem_addr must be aligned on a 16-byte
2757// boundary or a general-protection exception may be generated.
2758// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_store1_ps
2759#define _mm_store1_ps _mm_store_ps1
2760
2761// Store the upper 2 single-precision (32-bit) floating-point elements from a
2762// into memory.
2763// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storeh_pi
2764FORCE_INLINE void _mm_storeh_pi(__m64 *p, __m128 a)
2765{
2766 *p = vreinterpret_m64_f32(vget_high_f32(a));
2767}
2768
2769// Store the lower 2 single-precision (32-bit) floating-point elements from a
2770// into memory.
2771// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storel_pi
2772FORCE_INLINE void _mm_storel_pi(__m64 *p, __m128 a)
2773{
2774 *p = vreinterpret_m64_f32(vget_low_f32(a));
2775}
2776
2777// Store 4 single-precision (32-bit) floating-point elements from a into memory
2778// in reverse order. mem_addr must be aligned on a 16-byte boundary or a
2779// general-protection exception may be generated.
2780// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storer_ps
2781FORCE_INLINE void _mm_storer_ps(float *p, __m128 a)
2782{
2783 float32x4_t tmp = vrev64q_f32(vreinterpretq_f32_m128(a));
2784 float32x4_t rev = vextq_f32(tmp, tmp, 2);
2785 vst1q_f32(p, rev);
2786}
2787
2788// Store 128-bits (composed of 4 packed single-precision (32-bit) floating-point
2789// elements) from a into memory. mem_addr does not need to be aligned on any
2790// particular boundary.
2791// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storeu_ps
2792FORCE_INLINE void _mm_storeu_ps(float *p, __m128 a)
2793{
2794 vst1q_f32(p, vreinterpretq_f32_m128(a));
2795}
2796
2797// Stores 16-bits of integer data a at the address p.
2798// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storeu_si16
2799FORCE_INLINE void _mm_storeu_si16(void *p, __m128i a)
2800{
2801 vst1q_lane_s16((int16_t *) p, vreinterpretq_s16_m128i(a), 0);
2802}
2803
2804// Stores 64-bits of integer data a at the address p.
2805// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storeu_si64
2806FORCE_INLINE void _mm_storeu_si64(void *p, __m128i a)
2807{
2808 vst1q_lane_s64((int64_t *) p, vreinterpretq_s64_m128i(a), 0);
2809}
2810
2811// Store 64-bits of integer data from a into memory using a non-temporal memory
2812// hint.
2813// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_stream_pi
2814FORCE_INLINE void _mm_stream_pi(__m64 *p, __m64 a)
2815{
2816 vst1_s64((int64_t *) p, vreinterpret_s64_m64(a));
2817}
2818
2819// Store 128-bits (composed of 4 packed single-precision (32-bit) floating-
2820// point elements) from a into memory using a non-temporal memory hint.
2821// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_stream_ps
2822FORCE_INLINE void _mm_stream_ps(float *p, __m128 a)
2823{
2824#if __has_builtin(__builtin_nontemporal_store)
2825 __builtin_nontemporal_store(a, (float32x4_t *) p);
2826#else
2827 vst1q_f32(p, vreinterpretq_f32_m128(a));
2828#endif
2829}
2830
2831// Subtract packed single-precision (32-bit) floating-point elements in b from
2832// packed single-precision (32-bit) floating-point elements in a, and store the
2833// results in dst.
2834// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sub_ps
2835FORCE_INLINE __m128 _mm_sub_ps(__m128 a, __m128 b)
2836{
2837 return vreinterpretq_m128_f32(
2838     vsubq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
2839}
2840
2841// Subtract the lower single-precision (32-bit) floating-point element in b from
2842// the lower single-precision (32-bit) floating-point element in a, store the
2843// result in the lower element of dst, and copy the upper 3 packed elements from
2844// a to the upper elements of dst.
2845// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sub_ss
2846FORCE_INLINE __m128 _mm_sub_ss(__m128 a, __m128 b)
2847{
2848 return _mm_move_ss(a, _mm_sub_ps(a, b));
2849}
2850
2851// Macro: Transpose the 4x4 matrix formed by the 4 rows of single-precision
2852// (32-bit) floating-point elements in row0, row1, row2, and row3, and store the
2853// transposed matrix in these vectors (row0 now contains column 0, etc.).
2854// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_MM_TRANSPOSE4_PS
2855#define _MM_TRANSPOSE4_PS(row0, row1, row2, row3) \
2856 do { \
2857 float32x4x2_t ROW01 = vtrnq_f32(row0, row1); \
2858 float32x4x2_t ROW23 = vtrnq_f32(row2, row3); \
2859 row0 = vcombine_f32(vget_low_f32(ROW01.val[0]), \
2860 vget_low_f32(ROW23.val[0])); \
2861 row1 = vcombine_f32(vget_low_f32(ROW01.val[1]), \
2862 vget_low_f32(ROW23.val[1])); \
2863 row2 = vcombine_f32(vget_high_f32(ROW01.val[0]), \
2864 vget_high_f32(ROW23.val[0])); \
2865 row3 = vcombine_f32(vget_high_f32(ROW01.val[1]), \
2866 vget_high_f32(ROW23.val[1])); \
2867 } while (0)
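// Usage sketch (illustrative only): transpose a row-major 4x4 matrix held in a
// caller-provided float m[16]; the macro rewrites its arguments in place:
//   __m128 r0 = _mm_loadu_ps(m + 0), r1 = _mm_loadu_ps(m + 4);
//   __m128 r2 = _mm_loadu_ps(m + 8), r3 = _mm_loadu_ps(m + 12);
//   _MM_TRANSPOSE4_PS(r0, r1, r2, r3);   // r0..r3 now hold the columns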
2868
2869// according to the documentation, these intrinsics behave the same as the
2870// non-'u' versions. We'll just alias them here.
2871#define _mm_ucomieq_ss _mm_comieq_ss
2872#define _mm_ucomige_ss _mm_comige_ss
2873#define _mm_ucomigt_ss _mm_comigt_ss
2874#define _mm_ucomile_ss _mm_comile_ss
2875#define _mm_ucomilt_ss _mm_comilt_ss
2876#define _mm_ucomineq_ss _mm_comineq_ss
2877
2878// Return vector of type __m128i with undefined elements.
2879// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_undefined_si128
2880FORCE_INLINE __m128i _mm_undefined_si128(void)
2881{
2882#if defined(__GNUC__) || defined(__clang__)
2883#pragma GCC diagnostic push
2884#pragma GCC diagnostic ignored "-Wuninitialized"
2885#endif
2886 __m128i a;
2887#if defined(_MSC_VER)
2888 a = _mm_setzero_si128();
2889#endif
2890 return a;
2891#if defined(__GNUC__) || defined(__clang__)
2892#pragma GCC diagnostic pop
2893#endif
2894}
2895
2896// Return vector of type __m128 with undefined elements.
2897// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_undefined_ps
2898FORCE_INLINE __m128 _mm_undefined_ps(void)
2899{
2900#if defined(__GNUC__) || defined(__clang__)
2901#pragma GCC diagnostic push
2902#pragma GCC diagnostic ignored "-Wuninitialized"
2903#endif
2904 __m128 a;
2905#if defined(_MSC_VER)
2906 a = _mm_setzero_ps();
2907#endif
2908 return a;
2909#if defined(__GNUC__) || defined(__clang__)
2910#pragma GCC diagnostic pop
2911#endif
2912}
2913
2914// Unpack and interleave single-precision (32-bit) floating-point elements from
2915// the high half a and b, and store the results in dst.
2916// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpackhi_ps
2917FORCE_INLINE __m128 _mm_unpackhi_ps(__m128 a, __m128 b)
2918{
2919#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
2920 return vreinterpretq_m128_f32(
2921 vzip2q_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
2922#else
2923 float32x2_t a1 = vget_high_f32(vreinterpretq_f32_m128(a));
2924 float32x2_t b1 = vget_high_f32(vreinterpretq_f32_m128(b));
2925 float32x2x2_t result = vzip_f32(a1, b1);
2926 return vreinterpretq_m128_f32(vcombine_f32(result.val[0], result.val[1]));
2927#endif
2928}
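// Example (illustrative only): _mm_set_ps takes arguments from high lane to low,
// and unpackhi interleaves the upper halves:
//   __m128 hi = _mm_unpackhi_ps(_mm_set_ps(3, 2, 1, 0), _mm_set_ps(7, 6, 5, 4));
//   // lanes of hi, from lane 0 upward: {2, 6, 3, 7}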
2929
2930// Unpack and interleave single-precision (32-bit) floating-point elements from
2931// the low half of a and b, and store the results in dst.
2932// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpacklo_ps
2933FORCE_INLINE __m128 _mm_unpacklo_ps(__m128 a, __m128 b)
2934{
2935#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
2936 return vreinterpretq_m128_f32(
2937 vzip1q_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
2938#else
2939 float32x2_t a1 = vget_low_f32(vreinterpretq_f32_m128(a));
2940 float32x2_t b1 = vget_low_f32(vreinterpretq_f32_m128(b));
2941 float32x2x2_t result = vzip_f32(a1, b1);
2942 return vreinterpretq_m128_f32(vcombine_f32(result.val[0], result.val[1]));
2943#endif
2944}
2945
2946// Compute the bitwise XOR of packed single-precision (32-bit) floating-point
2947// elements in a and b, and store the results in dst.
2948// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_xor_ps
2949FORCE_INLINE __m128 _mm_xor_ps(__m128 a, __m128 b)
2950{
2951 return vreinterpretq_m128_s32(
2952 veorq_s32(vreinterpretq_s32_m128(a), vreinterpretq_s32_m128(b)));
2953}
2954
2955/* SSE2 */
2956
2957// Add packed 16-bit integers in a and b, and store the results in dst.
2958// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_epi16
2959FORCE_INLINE __m128i _mm_add_epi16(__m128i a, __m128i b)
2960{
2961 return vreinterpretq_m128i_s16(
2962 vaddq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
2963}
2964
2965// Add packed 32-bit integers in a and b, and store the results in dst.
2966// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_epi32
2967FORCE_INLINE __m128i _mm_add_epi32(__m128i a, __m128i b)
2968{
2969 return vreinterpretq_m128i_s32(
2970 vaddq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
2971}
2972
2973// Add packed 64-bit integers in a and b, and store the results in dst.
2974// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_epi64
2975FORCE_INLINE __m128i _mm_add_epi64(__m128i a, __m128i b)
2976{
2977 return vreinterpretq_m128i_s64(
2978 vaddq_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(b)));
2979}
2980
2981// Add packed 8-bit integers in a and b, and store the results in dst.
2982// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_epi8
2983FORCE_INLINE __m128i _mm_add_epi8(__m128i a, __m128i b)
2984{
2985 return vreinterpretq_m128i_s8(
2986 vaddq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
2987}
2988
2989// Add packed double-precision (64-bit) floating-point elements in a and b, and
2990// store the results in dst.
2991// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_pd
2992FORCE_INLINE __m128d _mm_add_pd(__m128d a, __m128d b)
2993{
2994#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
2995 return vreinterpretq_m128d_f64(
2996 vaddq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
2997#else
2998 double a0 =
2999 sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0));
3000 double a1 =
3001 sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1));
3002 double b0 =
3003 sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0));
3004 double b1 =
3005 sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 1));
3006 double c[2];
3007 c[0] = a0 + b0;
3008 c[1] = a1 + b1;
3009 return vld1q_f32((float32_t *) c);
3010#endif
3011}
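// Usage sketch (illustrative only): both lanes are added independently; the
// results can be read back with _mm_cvtsd_f64 or _mm_storeu_pd:
//   __m128d s = _mm_add_pd(_mm_set_pd(2.0, 1.0), _mm_set_pd(20.0, 10.0));
//   // lower lane of s is 11.0, upper lane is 22.0 (_mm_set_pd is high, low)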
3012
3013// Add the lower double-precision (64-bit) floating-point element in a and b,
3014// store the result in the lower element of dst, and copy the upper element from
3015// a to the upper element of dst.
3016// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_sd
3017FORCE_INLINE __m128d _mm_add_sd(__m128d a, __m128d b)
3018{
3019#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
3020 return _mm_move_sd(a, _mm_add_pd(a, b));
3021#else
3022 double a0, a1, b0;
3023 a0 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0));
3024 a1 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1));
3025 b0 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0));
3026 double c[2];
3027 c[0] = a0 + b0;
3028 c[1] = a1;
3029 return vld1q_f32((float32_t *) c);
3030#endif
3031}
3032
3033// Add 64-bit integers a and b, and store the result in dst.
3034// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_si64
3035FORCE_INLINE __m64 _mm_add_si64(__m64 a, __m64 b)
3036{
3037 return vreinterpret_m64_s64(
3038 vadd_s64(vreinterpret_s64_m64(a), vreinterpret_s64_m64(b)));
3039}
3040
3041// Add packed signed 16-bit integers in a and b using saturation, and store the
3042// results in dst.
3043// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_adds_epi16
3044FORCE_INLINE __m128i _mm_adds_epi16(__m128i a, __m128i b)
3045{
3046 return vreinterpretq_m128i_s16(
3047 vqaddq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
3048}
3049
3050// Add packed signed 8-bit integers in a and b using saturation, and store the
3051// results in dst.
3052// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_adds_epi8
3053FORCE_INLINE __m128i _mm_adds_epi8(__m128i a, __m128i b)
3054{
3055 return vreinterpretq_m128i_s8(
3056 vqaddq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
3057}
3058
3059// Add packed unsigned 16-bit integers in a and b using saturation, and store
3060// the results in dst.
3061// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_adds_epu16
3062FORCE_INLINE __m128i _mm_adds_epu16(__m128i a, __m128i b)
3063{
3064 return vreinterpretq_m128i_u16(
3065 vqaddq_u16(vreinterpretq_u16_m128i(a), vreinterpretq_u16_m128i(b)));
3066}
3067
3068// Add packed unsigned 8-bit integers in a and b using saturation, and store the
3069// results in dst.
3070// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_adds_epu8
3071FORCE_INLINE __m128i _mm_adds_epu8(__m128i a, __m128i b)
3072{
3073 return vreinterpretq_m128i_u8(
3074 vqaddq_u8(vreinterpretq_u8_m128i(a), vreinterpretq_u8_m128i(b)));
3075}
3076
3077// Compute the bitwise AND of packed double-precision (64-bit) floating-point
3078// elements in a and b, and store the results in dst.
3079// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_and_pd
3080FORCE_INLINE __m128d _mm_and_pd(__m128d a, __m128d b)
3081{
3082 return vreinterpretq_m128d_s64(
3083 vandq_s64(vreinterpretq_s64_m128d(a), vreinterpretq_s64_m128d(b)));
3084}
3085
3086// Compute the bitwise AND of 128 bits (representing integer data) in a and b,
3087// and store the result in dst.
3088// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_and_si128
3089FORCE_INLINE __m128i _mm_and_si128(__m128i a, __m128i b)
3090{
3091 return vreinterpretq_m128i_s32(
3092 vandq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
3093}
3094
3095// Compute the bitwise NOT of packed double-precision (64-bit) floating-point
3096// elements in a and then AND with b, and store the results in dst.
3097// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_andnot_pd
3098FORCE_INLINE __m128d _mm_andnot_pd(__m128d a, __m128d b)
3099{
3100 // *NOTE* argument swap
3101 return vreinterpretq_m128d_s64(
3102 vbicq_s64(vreinterpretq_s64_m128d(b), vreinterpretq_s64_m128d(a)));
3103}
3104
3105// Compute the bitwise NOT of 128 bits (representing integer data) in a and then
3106// AND with b, and store the result in dst.
3107// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_andnot_si128
3108FORCE_INLINE __m128i _mm_andnot_si128(__m128i a, __m128i b)
3109{
3110 return vreinterpretq_m128i_s32(
3111 vbicq_s32(vreinterpretq_s32_m128i(b),
3112 vreinterpretq_s32_m128i(a))); // *NOTE* argument swap
3113}
3114
3115// Average packed unsigned 16-bit integers in a and b, and store the results in
3116// dst.
3117// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_avg_epu16
3118FORCE_INLINE __m128i _mm_avg_epu16(__m128i a, __m128i b)
3119{
3120 return (__m128i) vrhaddq_u16(vreinterpretq_u16_m128i(a),
3121 vreinterpretq_u16_m128i(b));
3122}
3123
3124// Average packed unsigned 8-bit integers in a and b, and store the results in
3125// dst.
3126// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_avg_epu8
3127FORCE_INLINE __m128i _mm_avg_epu8(__m128i a, __m128i b)
3128{
3129 return vreinterpretq_m128i_u8(
3130 vrhaddq_u8(vreinterpretq_u8_m128i(a), vreinterpretq_u8_m128i(b)));
3131}
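// Example (illustrative only): the average rounds up, i.e. computes
// (a + b + 1) >> 1 per lane, matching the x86 pavgb behaviour:
//   __m128i avg = _mm_avg_epu8(_mm_set1_epi8((char) 1), _mm_set1_epi8((char) 2));
//   // every byte of avg is 2, not 1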
3132
3133// Shift a left by imm8 bytes while shifting in zeros, and store the results in
3134// dst.
3135// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_bslli_si128
3136#define _mm_bslli_si128(a, imm) _mm_slli_si128(a, imm)
3137
3138// Shift a right by imm8 bytes while shifting in zeros, and store the results in
3139// dst.
3140// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_bsrli_si128
3141#define _mm_bsrli_si128(a, imm) _mm_srli_si128(a, imm)
3142
3143// Cast vector of type __m128d to type __m128. This intrinsic is only used for
3144// compilation and does not generate any instructions, thus it has zero latency.
3145// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_castpd_ps
3146FORCE_INLINE __m128 _mm_castpd_ps(__m128d a)
3147{
3148 return vreinterpretq_m128_s64(vreinterpretq_s64_m128d(a));
3149}
3150
3151// Cast vector of type __m128d to type __m128i. This intrinsic is only used for
3152// compilation and does not generate any instructions, thus it has zero latency.
3153// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_castpd_si128
3154FORCE_INLINE __m128i _mm_castpd_si128(__m128d a)
3155{
3156 return vreinterpretq_m128i_s64(vreinterpretq_s64_m128d(a));
3157}
3158
3159// Cast vector of type __m128 to type __m128d. This intrinsic is only used for
3160// compilation and does not generate any instructions, thus it has zero latency.
3161// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_castps_pd
3162FORCE_INLINE __m128d _mm_castps_pd(__m128 a)
3163{
3164 return vreinterpretq_m128d_s32(vreinterpretq_s32_m128(a));
3165}
3166
3167// Cast vector of type __m128 to type __m128i. This intrinsic is only used for
3168// compilation and does not generate any instructions, thus it has zero latency.
3169// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_castps_si128
3170FORCE_INLINE __m128i _mm_castps_si128(__m128 a)
3171{
3172 return vreinterpretq_m128i_s32(vreinterpretq_s32_m128(a));
3173}
3174
3175// Cast vector of type __m128i to type __m128d. This intrinsic is only used for
3176// compilation and does not generate any instructions, thus it has zero latency.
3177// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_castsi128_pd
3178FORCE_INLINE __m128d _mm_castsi128_pd(__m128i a)
3179{
3180#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
3181 return vreinterpretq_m128d_f64(vreinterpretq_f64_m128i(a));
3182#else
3183 return vreinterpretq_m128d_f32(vreinterpretq_f32_m128i(a));
3184#endif
3185}
3186
3187// Cast vector of type __m128i to type __m128. This intrinsic is only used for
3188// compilation and does not generate any instructions, thus it has zero latency.
3189// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_castsi128_ps
3190FORCE_INLINE __m128 _mm_castsi128_ps(__m128i a)
3191{
3192 return vreinterpretq_m128_s32(vreinterpretq_s32_m128i(a));
3193}
3194
3195// Invalidate and flush the cache line that contains p from all levels of the
3196// cache hierarchy.
3197// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_clflush
3198#if defined(__APPLE__)
3199#include <libkern/OSCacheControl.h>
3200#endif
3201FORCE_INLINE void _mm_clflush(void const *p)
3202{
3203 (void) p;
3204
3205 /* sys_icache_invalidate is supported since macOS 10.5.
3206 * However, it does not work on non-jailbroken iOS devices, although the
3207 * compilation is successful.
3208 */
3209#if defined(__APPLE__)
3210 sys_icache_invalidate((void *) (uintptr_t) p, SSE2NEON_CACHELINE_SIZE);
3211#elif defined(__GNUC__) || defined(__clang__)
3212 uintptr_t ptr = (uintptr_t) p;
3213 __builtin___clear_cache((char *) ptr,
3214 (char *) ptr + SSE2NEON_CACHELINE_SIZE);
3215#elif (_MSC_VER) && SSE2NEON_INCLUDE_WINDOWS_H
3216 FlushInstructionCache(GetCurrentProcess(), p, SSE2NEON_CACHELINE_SIZE);
3217#endif
3218}
3219
3220// Compare packed 16-bit integers in a and b for equality, and store the results
3221// in dst.
3222// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpeq_epi16
3223FORCE_INLINE __m128i _mm_cmpeq_epi16(__m128i a, __m128i b)
3224{
3225 return vreinterpretq_m128i_u16(
3226 vceqq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
3227}
3228
3229// Compare packed 32-bit integers in a and b for equality, and store the results
3230// in dst.
3231// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpeq_epi32
3232FORCE_INLINE __m128i _mm_cmpeq_epi32(__m128i a, __m128i b)
3233{
3234 return vreinterpretq_m128i_u32(
3235 vceqq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
3236}
3237
3238// Compare packed 8-bit integers in a and b for equality, and store the results
3239// in dst.
3240// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpeq_epi8
3241FORCE_INLINE __m128i _mm_cmpeq_epi8(__m128i a, __m128i b)
3242{
3243 return vreinterpretq_m128i_u8(
3244 vceqq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
3245}
3246
3247// Compare packed double-precision (64-bit) floating-point elements in a and b
3248// for equality, and store the results in dst.
3249// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpeq_pd
3250FORCE_INLINE __m128d _mm_cmpeq_pd(__m128d a, __m128d b)
3251{
3252#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
3253 return vreinterpretq_m128d_u64(
3254 vceqq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
3255#else
3256 // (a == b) -> (a_lo == b_lo) && (a_hi == b_hi)
3257 uint32x4_t cmp =
3258 vceqq_u32(vreinterpretq_u32_m128d(a), vreinterpretq_u32_m128d(b));
3259 uint32x4_t swapped = vrev64q_u32(cmp);
3260 return vreinterpretq_m128d_u32(vandq_u32(cmp, swapped));
3261#endif
3262}
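// Usage sketch (illustrative only, a/b/x/y are hypothetical __m128d values):
// the compare family returns per-lane all-ones or all-zero masks, so it
// combines with the bitwise ops above for a branchless select,
// dst = (a == b) ? x : y:
//   __m128d m = _mm_cmpeq_pd(a, b);
//   __m128d r = _mm_or_pd(_mm_and_pd(m, x), _mm_andnot_pd(m, y));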
3263
3264// Compare the lower double-precision (64-bit) floating-point elements in a and
3265// b for equality, store the result in the lower element of dst, and copy the
3266// upper element from a to the upper element of dst.
3267// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpeq_sd
3268FORCE_INLINE __m128d _mm_cmpeq_sd(__m128d a, __m128d b)
3269{
3270 return _mm_move_sd(a, _mm_cmpeq_pd(a, b));
3271}
3272
3273// Compare packed double-precision (64-bit) floating-point elements in a and b
3274// for greater-than-or-equal, and store the results in dst.
3275// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpge_pd
3276FORCE_INLINE __m128d _mm_cmpge_pd(__m128d a, __m128d b)
3277{
3278#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
3279 return vreinterpretq_m128d_u64(
3280 vcgeq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
3281#else
3282 double a0 =
3283 sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0));
3284 double a1 =
3285 sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1));
3286 double b0 =
3287 sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0));
3288 double b1 =
3289 sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 1));
3290 uint64_t d[2];
3291 d[0] = a0 >= b0 ? ~UINT64_C(0) : UINT64_C(0);
3292 d[1] = a1 >= b1 ? ~UINT64_C(0) : UINT64_C(0);
3293
3294 return vreinterpretq_m128d_u64(vld1q_u64(d));
3295#endif
3296}
3297
3298// Compare the lower double-precision (64-bit) floating-point elements in a and
3299// b for greater-than-or-equal, store the result in the lower element of dst,
3300// and copy the upper element from a to the upper element of dst.
3301// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpge_sd
3302FORCE_INLINE __m128d _mm_cmpge_sd(__m128d a, __m128d b)
3303{
3304#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
3305 return _mm_move_sd(a, _mm_cmpge_pd(a, b));
3306#else
3307 // expand "_mm_cmpge_pd()" to reduce unnecessary operations
3308 double a0, b0;
3309 a0 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0));
3310 uint64_t a1 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1);
3311 b0 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0));
3312 uint64_t d[2];
3313 d[0] = a0 >= b0 ? ~UINT64_C(0) : UINT64_C(0);
3314 d[1] = a1;
3315
3316 return vreinterpretq_m128d_u64(vld1q_u64(d));
3317#endif
3318}
3319
3320// Compare packed signed 16-bit integers in a and b for greater-than, and store
3321// the results in dst.
3322// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpgt_epi16
3323FORCE_INLINE __m128i _mm_cmpgt_epi16(__m128i a, __m128i b)
3324{
3325 return vreinterpretq_m128i_u16(
3326 vcgtq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
3327}
3328
3329// Compare packed signed 32-bit integers in a and b for greater-than, and store
3330// the results in dst.
3331// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpgt_epi32
3332FORCE_INLINE __m128i _mm_cmpgt_epi32(__m128i a, __m128i b)
3333{
3334 return vreinterpretq_m128i_u32(
3335 vcgtq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
3336}
3337
3338// Compare packed signed 8-bit integers in a and b for greater-than, and store
3339// the results in dst.
3340// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpgt_epi8
3341FORCE_INLINE __m128i _mm_cmpgt_epi8(__m128i a, __m128i b)
3342{
3343 return vreinterpretq_m128i_u8(
3344 vcgtq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
3345}
3346
3347// Compare packed double-precision (64-bit) floating-point elements in a and b
3348// for greater-than, and store the results in dst.
3349// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpgt_pd
3350FORCE_INLINE __m128d _mm_cmpgt_pd(__m128d a, __m128d b)
3351{
3352#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
3353 return vreinterpretq_m128d_u64(
3354 vcgtq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
3355#else
3356 double a0 =
3357 sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0));
3358 double a1 =
3359 sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1));
3360 double b0 =
3361 sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0));
3362 double b1 =
3363 sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 1));
3364 uint64_t d[2];
3365 d[0] = a0 > b0 ? ~UINT64_C(0) : UINT64_C(0);
3366 d[1] = a1 > b1 ? ~UINT64_C(0) : UINT64_C(0);
3367
3368 return vreinterpretq_m128d_u64(vld1q_u64(d));
3369#endif
3370}
3371
3372// Compare the lower double-precision (64-bit) floating-point elements in a and
3373// b for greater-than, store the result in the lower element of dst, and copy
3374// the upper element from a to the upper element of dst.
3375// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpgt_sd
3376FORCE_INLINE __m128d _mm_cmpgt_sd(__m128d a, __m128d b)
3377{
3378#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
3379 return _mm_move_sd(a, _mm_cmpgt_pd(a, b));
3380#else
3381 // expand "_mm_cmpgt_pd()" to reduce unnecessary operations
3382 double a0, b0;
3383 a0 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0));
3384 uint64_t a1 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1);
3385 b0 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0));
3386 uint64_t d[2];
3387 d[0] = a0 > b0 ? ~UINT64_C(0) : UINT64_C(0);
3388 d[1] = a1;
3389
3390 return vreinterpretq_m128d_u64(vld1q_u64(d));
3391#endif
3392}
3393
3394// Compare packed double-precision (64-bit) floating-point elements in a and b
3395// for less-than-or-equal, and store the results in dst.
3396// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmple_pd
3397FORCE_INLINE __m128d _mm_cmple_pd(__m128d a, __m128d b)
3398{
3399#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
3400 return vreinterpretq_m128d_u64(
3401 vcleq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
3402#else
3403 double a0 =
3404 sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0));
3405 double a1 =
3406 sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1));
3407 double b0 =
3408 sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0));
3409 double b1 =
3410 sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 1));
3411 uint64_t d[2];
3412 d[0] = a0 <= b0 ? ~UINT64_C(0) : UINT64_C(0);
3413 d[1] = a1 <= b1 ? ~UINT64_C(0) : UINT64_C(0);
3414
3415 return vreinterpretq_m128d_u64(vld1q_u64(d));
3416#endif
3417}
3418
3419// Compare the lower double-precision (64-bit) floating-point elements in a and
3420// b for less-than-or-equal, store the result in the lower element of dst, and
3421// copy the upper element from a to the upper element of dst.
3422// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmple_sd
3423FORCE_INLINE __m128d _mm_cmple_sd(__m128d a, __m128d b)
3424{
3425#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
3426 return _mm_move_sd(a, _mm_cmple_pd(a, b));
3427#else
3428 // expand "_mm_cmple_pd()" to reduce unnecessary operations
3429 double a0, b0;
3430 a0 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0));
3431 uint64_t a1 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1);
3432 b0 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0));
3433 uint64_t d[2];
3434 d[0] = a0 <= b0 ? ~UINT64_C(0) : UINT64_C(0);
3435 d[1] = a1;
3436
3437 return vreinterpretq_m128d_u64(vld1q_u64(d));
3438#endif
3439}
3440
3441// Compare packed signed 16-bit integers in a and b for less-than, and store the
3442// results in dst. Note: This intrinsic emits the pcmpgtw instruction with the
3443// order of the operands switched.
3444// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmplt_epi16
3445FORCE_INLINE __m128i _mm_cmplt_epi16(__m128i a, __m128i b)
3446{
3447 return vreinterpretq_m128i_u16(
3448 vcltq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
3449}
3450
3451// Compare packed signed 32-bit integers in a and b for less-than, and store the
3452// results in dst. Note: This intrinsic emits the pcmpgtd instruction with the
3453// order of the operands switched.
3454// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmplt_epi32
3455FORCE_INLINE __m128i _mm_cmplt_epi32(__m128i a, __m128i b)
3456{
3457 return vreinterpretq_m128i_u32(
3458 vcltq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
3459}
3460
3461// Compare packed signed 8-bit integers in a and b for less-than, and store the
3462// results in dst. Note: This intrinsic emits the pcmpgtb instruction with the
3463// order of the operands switched.
3464// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmplt_epi8
3465FORCE_INLINE __m128i _mm_cmplt_epi8(__m128i a, __m128i b)
3466{
3467 return vreinterpretq_m128i_u8(
3468 vcltq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
3469}
3470
3471// Compare packed double-precision (64-bit) floating-point elements in a and b
3472// for less-than, and store the results in dst.
3473// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmplt_pd
3474FORCE_INLINE __m128d _mm_cmplt_pd(__m128d a, __m128d b)
3475{
3476#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
3477 return vreinterpretq_m128d_u64(
3478 vcltq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
3479#else
3480 double a0 =
3481 sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0));
3482 double a1 =
3483 sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1));
3484 double b0 =
3485 sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0));
3486 double b1 =
3487 sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 1));
3488 uint64_t d[2];
3489 d[0] = a0 < b0 ? ~UINT64_C(0) : UINT64_C(0);
3490 d[1] = a1 < b1 ? ~UINT64_C(0) : UINT64_C(0);
3491
3492 return vreinterpretq_m128d_u64(vld1q_u64(d));
3493#endif
3494}
3495
3496// Compare the lower double-precision (64-bit) floating-point elements in a and
3497// b for less-than, store the result in the lower element of dst, and copy the
3498// upper element from a to the upper element of dst.
3499// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmplt_sd
3500FORCE_INLINE __m128d _mm_cmplt_sd(__m128d a, __m128d b)
3501{
3502#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
3503 return _mm_move_sd(a, _mm_cmplt_pd(a, b));
3504#else
3505 double a0, b0;
3506 a0 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0));
3507 uint64_t a1 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1);
3508 b0 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0));
3509 uint64_t d[2];
3510 d[0] = a0 < b0 ? ~UINT64_C(0) : UINT64_C(0);
3511 d[1] = a1;
3512
3513 return vreinterpretq_m128d_u64(vld1q_u64(d));
3514#endif
3515}
3516
3517// Compare packed double-precision (64-bit) floating-point elements in a and b
3518// for not-equal, and store the results in dst.
3519// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpneq_pd
3520FORCE_INLINE __m128d _mm_cmpneq_pd(__m128d a, __m128d b)
3521{
3522#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
3523 return vreinterpretq_m128d_s32(vmvnq_s32(vreinterpretq_s32_u64(
3524 vceqq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)))));
3525#else
3526 // (a == b) -> (a_lo == b_lo) && (a_hi == b_hi)
3527 uint32x4_t cmp =
3528 vceqq_u32(vreinterpretq_u32_m128d(a), vreinterpretq_u32_m128d(b));
3529 uint32x4_t swapped = vrev64q_u32(cmp);
3530 return vreinterpretq_m128d_u32(vmvnq_u32(vandq_u32(cmp, swapped)));
3531#endif
3532}
3533
3534// Compare the lower double-precision (64-bit) floating-point elements in a and
3535// b for not-equal, store the result in the lower element of dst, and copy the
3536// upper element from a to the upper element of dst.
3537// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpneq_sd
3538FORCE_INLINE __m128d _mm_cmpneq_sd(__m128d a, __m128d b)
3539{
3540 return _mm_move_sd(a, _mm_cmpneq_pd(a, b));
3541}
3542
3543// Compare packed double-precision (64-bit) floating-point elements in a and b
3544// for not-greater-than-or-equal, and store the results in dst.
3545// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnge_pd
3546FORCE_INLINE __m128d _mm_cmpnge_pd(__m128d a, __m128d b)
3547{
3548#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
3549 return vreinterpretq_m128d_u64(veorq_u64(
3550 vcgeq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)),
3551 vdupq_n_u64(UINT64_MAX)));
3552#else
3553 double a0 =
3554 sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0));
3555 double a1 =
3556 sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1));
3557 double b0 =
3558 sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0));
3559 double b1 =
3560 sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 1));
3561 uint64_t d[2];
3562 d[0] = !(a0 >= b0) ? ~UINT64_C(0) : UINT64_C(0);
3563 d[1] = !(a1 >= b1) ? ~UINT64_C(0) : UINT64_C(0);
3564
3565 return vreinterpretq_m128d_u64(vld1q_u64(d));
3566#endif
3567}
3568
3569// Compare the lower double-precision (64-bit) floating-point elements in a and
3570// b for not-greater-than-or-equal, store the result in the lower element of
3571// dst, and copy the upper element from a to the upper element of dst.
3572// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnge_sd
3573FORCE_INLINE __m128d _mm_cmpnge_sd(__m128d a, __m128d b)
3574{
3575 return _mm_move_sd(a, _mm_cmpnge_pd(a, b));
3576}
3577
3578// Compare packed double-precision (64-bit) floating-point elements in a and b
3579// for not-greater-than, and store the results in dst.
3580// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpngt_pd
3581FORCE_INLINE __m128d _mm_cmpngt_pd(__m128d a, __m128d b)
3582{
3583#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
3584 return vreinterpretq_m128d_u64(veorq_u64(
3585 vcgtq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)),
3586 vdupq_n_u64(UINT64_MAX)));
3587#else
3588 double a0 =
3589 sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0));
3590 double a1 =
3591 sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1));
3592 double b0 =
3593 sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0));
3594 double b1 =
3595 sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 1));
3596 uint64_t d[2];
3597 d[0] = !(a0 > b0) ? ~UINT64_C(0) : UINT64_C(0);
3598 d[1] = !(a1 > b1) ? ~UINT64_C(0) : UINT64_C(0);
3599
3600 return vreinterpretq_m128d_u64(vld1q_u64(d));
3601#endif
3602}
3603
3604// Compare the lower double-precision (64-bit) floating-point elements in a and
3605// b for not-greater-than, store the result in the lower element of dst, and
3606// copy the upper element from a to the upper element of dst.
3607// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpngt_sd
3608FORCE_INLINE __m128d _mm_cmpngt_sd(__m128d a, __m128d b)
3609{
3610 return _mm_move_sd(a, _mm_cmpngt_pd(a, b));
3611}
3612
3613// Compare packed double-precision (64-bit) floating-point elements in a and b
3614// for not-less-than-or-equal, and store the results in dst.
3615// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnle_pd
3616FORCE_INLINE __m128d _mm_cmpnle_pd(__m128d a, __m128d b)
3617{
3618#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
3619 return vreinterpretq_m128d_u64(veorq_u64(
3620 vcleq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)),
3621 vdupq_n_u64(UINT64_MAX)));
3622#else
3623 double a0 =
3624 sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0));
3625 double a1 =
3626 sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1));
3627 double b0 =
3628 sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0));
3629 double b1 =
3630 sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 1));
3631 uint64_t d[2];
3632 d[0] = !(a0 <= b0) ? ~UINT64_C(0) : UINT64_C(0);
3633 d[1] = !(a1 <= b1) ? ~UINT64_C(0) : UINT64_C(0);
3634
3635 return vreinterpretq_m128d_u64(vld1q_u64(d));
3636#endif
3637}
3638
3639// Compare the lower double-precision (64-bit) floating-point elements in a and
3640// b for not-less-than-or-equal, store the result in the lower element of dst,
3641// and copy the upper element from a to the upper element of dst.
3642// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnle_sd
3643FORCE_INLINE __m128d _mm_cmpnle_sd(__m128d a, __m128d b)
3644{
3645 return _mm_move_sd(a, _mm_cmpnle_pd(a, b));
3646}
3647
3648// Compare packed double-precision (64-bit) floating-point elements in a and b
3649// for not-less-than, and store the results in dst.
3650// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnlt_pd
3651FORCE_INLINE __m128d _mm_cmpnlt_pd(__m128d a, __m128d b)
3652{
3653#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
3654 return vreinterpretq_m128d_u64(veorq_u64(
3655 vcltq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)),
3656 vdupq_n_u64(UINT64_MAX)));
3657#else
3658 double a0 =
3659 sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0));
3660 double a1 =
3661 sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1));
3662 double b0 =
3663 sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0));
3664 double b1 =
3665 sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 1));
3666 uint64_t d[2];
3667 d[0] = !(a0 < b0) ? ~UINT64_C(0) : UINT64_C(0);
3668 d[1] = !(a1 < b1) ? ~UINT64_C(0) : UINT64_C(0);
3669
3670 return vreinterpretq_m128d_u64(vld1q_u64(d));
3671#endif
3672}
3673
3674// Compare the lower double-precision (64-bit) floating-point elements in a and
3675// b for not-less-than, store the result in the lower element of dst, and copy
3676// the upper element from a to the upper element of dst.
3677// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnlt_sd
3678FORCE_INLINE __m128d _mm_cmpnlt_sd(__m128d a, __m128d b)
3679{
3680 return _mm_move_sd(a, _mm_cmpnlt_pd(a, b));
3681}
3682
3683// Compare packed double-precision (64-bit) floating-point elements in a and b
3684// to see if neither is NaN, and store the results in dst.
3685// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpord_pd
3686FORCE_INLINE __m128d _mm_cmpord_pd(__m128d a, __m128d b)
3687{
3688#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
3689 // Excluding NaNs, any two floating point numbers can be compared.
3690 uint64x2_t not_nan_a =
3691 vceqq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(a));
3692 uint64x2_t not_nan_b =
3693 vceqq_f64(vreinterpretq_f64_m128d(b), vreinterpretq_f64_m128d(b));
3694 return vreinterpretq_m128d_u64(vandq_u64(not_nan_a, not_nan_b));
3695#else
3696 double a0 =
3697 sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0));
3698 double a1 =
3699 sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1));
3700 double b0 =
3701 sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0));
3702 double b1 =
3703 sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 1));
3704 uint64_t d[2];
3705 d[0] = (a0 == a0 && b0 == b0) ? ~UINT64_C(0) : UINT64_C(0);
3706 d[1] = (a1 == a1 && b1 == b1) ? ~UINT64_C(0) : UINT64_C(0);
3707
3708 return vreinterpretq_m128d_u64(vld1q_u64(d));
3709#endif
3710}
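// Example (illustrative only): "ordered" means neither input lane is NaN, so
// the mask is commonly used to zero out lanes that would poison a reduction
// (x is a hypothetical __m128d value):
//   __m128d safe = _mm_and_pd(x, _mm_cmpord_pd(x, x));   // NaN lanes become +0.0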
3711
3712// Compare the lower double-precision (64-bit) floating-point elements in a and
3713// b to see if neither is NaN, store the result in the lower element of dst, and
3714// copy the upper element from a to the upper element of dst.
3715// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpord_sd
3716FORCE_INLINE __m128d _mm_cmpord_sd(__m128d a, __m128d b)
3717{
3718#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
3719 return _mm_move_sd(a, _mm_cmpord_pd(a, b));
3720#else
3721 double a0, b0;
3722 a0 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0));
3723 uint64_t a1 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1);
3724 b0 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0));
3725 uint64_t d[2];
3726 d[0] = (a0 == a0 && b0 == b0) ? ~UINT64_C(0) : UINT64_C(0);
3727 d[1] = a1;
3728
3729 return vreinterpretq_m128d_u64(vld1q_u64(d));
3730#endif
3731}
3732
3733// Compare packed double-precision (64-bit) floating-point elements in a and b
3734// to see if either is NaN, and store the results in dst.
3735// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpunord_pd
3736FORCE_INLINE __m128d _mm_cmpunord_pd(__m128d a, __m128d b)
3737{
3738#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
3739 // A NaN never compares equal to itself, so self-comparison flags the non-NaN lanes.
3740 uint64x2_t not_nan_a =
3741 vceqq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(a));
3742 uint64x2_t not_nan_b =
3743 vceqq_f64(vreinterpretq_f64_m128d(b), vreinterpretq_f64_m128d(b));
3744 return vreinterpretq_m128d_s32(
3745 vmvnq_s32(vreinterpretq_s32_u64(vandq_u64(not_nan_a, not_nan_b))));
3746#else
3747 double a0 =
3748 sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0));
3749 double a1 =
3750 sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1));
3751 double b0 =
3752 sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0));
3753 double b1 =
3754 sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 1));
3755 uint64_t d[2];
3756 d[0] = (a0 == a0 && b0 == b0) ? UINT64_C(0) : ~UINT64_C(0);
3757 d[1] = (a1 == a1 && b1 == b1) ? UINT64_C(0) : ~UINT64_C(0);
3758
3759 return vreinterpretq_m128d_u64(vld1q_u64(d));
3760#endif
3761}
3762
3763// Compare the lower double-precision (64-bit) floating-point elements in a and
3764// b to see if either is NaN, store the result in the lower element of dst, and
3765// copy the upper element from a to the upper element of dst.
3766// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpunord_sd
3767FORCE_INLINE __m128d _mm_cmpunord_sd(__m128d a, __m128d b)
3768{
3769#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
3770 return _mm_move_sd(a, _mm_cmpunord_pd(a, b));
3771#else
3772 double a0, b0;
3773 a0 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0));
3774 uint64_t a1 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1);
3775 b0 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0));
3776 uint64_t d[2];
3777 d[0] = (a0 == a0 && b0 == b0) ? UINT64_C(0) : ~UINT64_C(0);
3778 d[1] = a1;
3779
3780 return vreinterpretq_m128d_u64(vld1q_u64(d));
3781#endif
3782}
3783
3784// Compare the lower double-precision (64-bit) floating-point element in a and b
3785// for greater-than-or-equal, and return the boolean result (0 or 1).
3786// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comige_sd
3787FORCE_INLINE int _mm_comige_sd(__m128d a, __m128d b)
3788{
3789#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
3790 return vgetq_lane_u64(vcgeq_f64(a, b), 0) & 0x1;
3791#else
3792 double a0, b0;
3793 a0 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0));
3794 b0 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0));
3795 return a0 >= b0;
3796#endif
3797}
3798
3799// Compare the lower double-precision (64-bit) floating-point element in a and b
3800// for greater-than, and return the boolean result (0 or 1).
3801// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comigt_sd
3802FORCE_INLINE int _mm_comigt_sd(__m128d a, __m128d b)
3803{
3804#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
3805 return vgetq_lane_u64(vcgtq_f64(a, b), 0) & 0x1;
3806#else
3807 double a0, b0;
3808 a0 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0));
3809 b0 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0));
3810
3811 return a0 > b0;
3812#endif
3813}
3814
3815// Compare the lower double-precision (64-bit) floating-point element in a and b
3816// for less-than-or-equal, and return the boolean result (0 or 1).
3817// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comile_sd
3818FORCE_INLINE int _mm_comile_sd(__m128d a, __m128d b)
3819{
3820#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
3821 return vgetq_lane_u64(vcleq_f64(a, b), 0) & 0x1;
3822#else
3823 double a0, b0;
3824 a0 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0));
3825 b0 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0));
3826
3827 return a0 <= b0;
3828#endif
3829}
3830
3831// Compare the lower double-precision (64-bit) floating-point element in a and b
3832// for less-than, and return the boolean result (0 or 1).
3833// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comilt_sd
3834FORCE_INLINE int _mm_comilt_sd(__m128d a, __m128d b)
3835{
3836#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
3837 return vgetq_lane_u64(vcltq_f64(a, b), 0) & 0x1;
3838#else
3839 double a0, b0;
3840 a0 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0));
3841 b0 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0));
3842
3843 return a0 < b0;
3844#endif
3845}
3846
3847// Compare the lower double-precision (64-bit) floating-point element in a and b
3848// for equality, and return the boolean result (0 or 1).
3849// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comieq_sd
3850FORCE_INLINE int _mm_comieq_sd(__m128d a, __m128d b)
3851{
3852#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
3853 return vgetq_lane_u64(vceqq_f64(a, b), 0) & 0x1;
3854#else
3855 uint32x4_t a_not_nan =
3856 vceqq_u32(vreinterpretq_u32_m128d(a), vreinterpretq_u32_m128d(a));
3857 uint32x4_t b_not_nan =
3858 vceqq_u32(vreinterpretq_u32_m128d(b), vreinterpretq_u32_m128d(b));
3859 uint32x4_t a_and_b_not_nan = vandq_u32(a_not_nan, b_not_nan);
3860 uint32x4_t a_eq_b =
3861 vceqq_u32(vreinterpretq_u32_m128d(a), vreinterpretq_u32_m128d(b));
3862 uint64x2_t and_results = vandq_u64(vreinterpretq_u64_u32(a_and_b_not_nan),
3863 vreinterpretq_u64_u32(a_eq_b));
3864 return vgetq_lane_u64(and_results, 0) & 0x1;
3865#endif
3866}
3867
3868// Compare the lower double-precision (64-bit) floating-point element in a and b
3869// for not-equal, and return the boolean result (0 or 1).
3870// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comineq_sd
3871FORCE_INLINE int _mm_comineq_sd(__m128d a, __m128d b)
3872{
3873 return !_mm_comieq_sd(a, b);
3874}
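// Usage sketch (illustrative only): unlike the _mm_cmp*_sd family, the comi
// helpers return a plain int, so they can drive ordinary control flow
// (err and tol are hypothetical doubles):
//   if (_mm_comilt_sd(_mm_set_sd(err), _mm_set_sd(tol)))
//       return 1;   // err < tol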
3875
3876// Convert packed signed 32-bit integers in a to packed double-precision
3877// (64-bit) floating-point elements, and store the results in dst.
3878// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepi32_pd
3879FORCE_INLINE __m128d _mm_cvtepi32_pd(__m128i a)
3880{
3881#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
3882 return vreinterpretq_m128d_f64(
3883 vcvtq_f64_s64(vmovl_s32(vget_low_s32(vreinterpretq_s32_m128i(a)))));
3884#else
3885 double a0 = (double) vgetq_lane_s32(vreinterpretq_s32_m128i(a), 0);
3886 double a1 = (double) vgetq_lane_s32(vreinterpretq_s32_m128i(a), 1);
3887 return _mm_set_pd(a1, a0);
3888#endif
3889}
3890
3891// Convert packed signed 32-bit integers in a to packed single-precision
3892// (32-bit) floating-point elements, and store the results in dst.
3893// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepi32_ps
3894FORCE_INLINE __m128 _mm_cvtepi32_ps(__m128i a)
3895{
3896 return vreinterpretq_m128_f32(vcvtq_f32_s32(vreinterpretq_s32_m128i(a)));
3897}
3898
3899// Convert packed double-precision (64-bit) floating-point elements in a to
3900// packed 32-bit integers, and store the results in dst.
3901// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtpd_epi32
3902FORCE_INLINE __m128i _mm_cvtpd_epi32(__m128d a)
3903{
3904// vrnd32xq_f64 not supported on clang
3905#if defined(__ARM_FEATURE_FRINT) && !defined(__clang__)
3906 float64x2_t rounded = vrnd32xq_f64(vreinterpretq_f64_m128d(a));
3907 int64x2_t integers = vcvtq_s64_f64(rounded);
3908 return vreinterpretq_m128i_s32(
3909 vcombine_s32(vmovn_s64(integers), vdup_n_s32(0)));
3910#else
3911 __m128d rnd = _mm_round_pd(a, _MM_FROUND_CUR_DIRECTION);
3912 double d0, d1;
3913 d0 = sse2neon_recast_u64_f64(
3914 vgetq_lane_u64(vreinterpretq_u64_m128d(rnd), 0));
3915 d1 = sse2neon_recast_u64_f64(
3916 vgetq_lane_u64(vreinterpretq_u64_m128d(rnd), 1));
3917 return _mm_set_epi32(0, 0, (int32_t) d1, (int32_t) d0);
3918#endif
3919}
3920
3921// Convert packed double-precision (64-bit) floating-point elements in a to
3922// packed 32-bit integers, and store the results in dst.
3923// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtpd_pi32
3924FORCE_INLINE __m64 _mm_cvtpd_pi32(__m128d a)
3925{
3926 __m128d rnd = _mm_round_pd(a, _MM_FROUND_CUR_DIRECTION);
3927 double d0, d1;
3928 d0 = sse2neon_recast_u64_f64(
3929 vgetq_lane_u64(vreinterpretq_u64_m128d(rnd), 0));
3930 d1 = sse2neon_recast_u64_f64(
3931 vgetq_lane_u64(vreinterpretq_u64_m128d(rnd), 1));
3932 int32_t ALIGN_STRUCT(16) data[2] = {(int32_t) d0, (int32_t) d1};
3933 return vreinterpret_m64_s32(vld1_s32(data));
3934}
3935
3936// Convert packed double-precision (64-bit) floating-point elements in a to
3937// packed single-precision (32-bit) floating-point elements, and store the
3938// results in dst.
3939// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtpd_ps
3940FORCE_INLINE __m128 _mm_cvtpd_ps(__m128d a)
3941{
3942#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
3943 float32x2_t tmp = vcvt_f32_f64(vreinterpretq_f64_m128d(a));
3944 return vreinterpretq_m128_f32(vcombine_f32(tmp, vdup_n_f32(0)));
3945#else
3946 double a0, a1;
3947 a0 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0));
3948 a1 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1));
3949 return _mm_set_ps(0, 0, (float) a1, (float) a0);
3950#endif
3951}
3952
3953// Convert packed signed 32-bit integers in a to packed double-precision
3954// (64-bit) floating-point elements, and store the results in dst.
3955// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtpi32_pd
3956FORCE_INLINE __m128d _mm_cvtpi32_pd(__m64 a)
3957{
3958#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
3959 return vreinterpretq_m128d_f64(
3960 vcvtq_f64_s64(vmovl_s32(vreinterpret_s32_m64(a))));
3961#else
3962 double a0 = (double) vget_lane_s32(vreinterpret_s32_m64(a), 0);
3963 double a1 = (double) vget_lane_s32(vreinterpret_s32_m64(a), 1);
3964 return _mm_set_pd(a1, a0);
3965#endif
3966}
3967
3968// Convert packed single-precision (32-bit) floating-point elements in a to
3969// packed 32-bit integers, and store the results in dst.
3970// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtps_epi32
3971// *NOTE*. The default rounding mode on SSE is 'round to even', which ARMv7-A
3972// does not support! It is supported on ARMv8-A however.
3973FORCE_INLINE __m128i _mm_cvtps_epi32(__m128 a)
3974{
3975#if defined(__ARM_FEATURE_FRINT)
3976 return vreinterpretq_m128i_s32(vcvtq_s32_f32(vrnd32xq_f32(a)));
3977#elif (defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)) || \
3978 defined(__ARM_FEATURE_DIRECTED_ROUNDING)
3979 switch (_MM_GET_ROUNDING_MODE()) {
3980 case _MM_ROUND_NEAREST:
3981 return vreinterpretq_m128i_s32(vcvtnq_s32_f32(a));
3982 case _MM_ROUND_DOWN:
3983 return vreinterpretq_m128i_s32(vcvtmq_s32_f32(a));
3984 case _MM_ROUND_UP:
3985 return vreinterpretq_m128i_s32(vcvtpq_s32_f32(a));
3986 default: // _MM_ROUND_TOWARD_ZERO
3987 return vreinterpretq_m128i_s32(vcvtq_s32_f32(a));
3988 }
3989#else
3990 float *f = (float *) &a;
3991 switch (_MM_GET_ROUNDING_MODE()) {
3992 case _MM_ROUND_NEAREST: {
3993 uint32x4_t signmask = vdupq_n_u32(0x80000000);
3994 float32x4_t half = vbslq_f32(signmask, vreinterpretq_f32_m128(a),
3995 vdupq_n_f32(0.5f)); /* +/- 0.5 */
3996 int32x4_t r_normal = vcvtq_s32_f32(vaddq_f32(
3997 vreinterpretq_f32_m128(a), half)); /* round to integer: [a + 0.5]*/
3998 int32x4_t r_trunc = vcvtq_s32_f32(
3999 vreinterpretq_f32_m128(a)); /* truncate to integer: [a] */
4000 int32x4_t plusone = vreinterpretq_s32_u32(vshrq_n_u32(
4001 vreinterpretq_u32_s32(vnegq_s32(r_trunc)), 31)); /* 1 or 0 */
4002 int32x4_t r_even = vbicq_s32(vaddq_s32(r_trunc, plusone),
4003 vdupq_n_s32(1)); /* ([a] + {0,1}) & ~1 */
4004 float32x4_t delta = vsubq_f32(
4005 vreinterpretq_f32_m128(a),
4006 vcvtq_f32_s32(r_trunc)); /* compute delta: delta = (a - [a]) */
4007 uint32x4_t is_delta_half =
4008 vceqq_f32(delta, half); /* delta == +/- 0.5 */
4009 return vreinterpretq_m128i_s32(
4010 vbslq_s32(is_delta_half, r_even, r_normal));
4011 }
4012 case _MM_ROUND_DOWN:
4013 return _mm_set_epi32(floorf(f[3]), floorf(f[2]), floorf(f[1]),
4014 floorf(f[0]));
4015 case _MM_ROUND_UP:
4016 return _mm_set_epi32(ceilf(f[3]), ceilf(f[2]), ceilf(f[1]),
4017 ceilf(f[0]));
4018 default: // _MM_ROUND_TOWARD_ZERO
4019 return _mm_set_epi32((int32_t) f[3], (int32_t) f[2], (int32_t) f[1],
4020 (int32_t) f[0]);
4021 }
4022#endif
4023}
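// Example (illustrative only): under the default rounding mode the conversion
// rounds to nearest even, which is what the fallback above reproduces on
// ARMv7-A:
//   __m128i r = _mm_cvtps_epi32(_mm_set_ps(2.5f, 1.5f, 0.5f, -0.5f));
//   // lanes of r, from lane 0 upward: {0, 0, 2, 2}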
4024
4025// Convert packed single-precision (32-bit) floating-point elements in a to
4026// packed double-precision (64-bit) floating-point elements, and store the
4027// results in dst.
4028// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtps_pd
4029FORCE_INLINE __m128d _mm_cvtps_pd(__m128 a)
4030{
4031#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
4032 return vreinterpretq_m128d_f64(
4033 vcvt_f64_f32(vget_low_f32(vreinterpretq_f32_m128(a))));
4034#else
4035 double a0 = (double) vgetq_lane_f32(vreinterpretq_f32_m128(a), 0);
4036 double a1 = (double) vgetq_lane_f32(vreinterpretq_f32_m128(a), 1);
4037 return _mm_set_pd(a1, a0);
4038#endif
4039}
4040
4041// Copy the lower double-precision (64-bit) floating-point element of a to dst.
4042// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsd_f64
4043FORCE_INLINE double _mm_cvtsd_f64(__m128d a)
4044{
4045#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
4046 return (double) vgetq_lane_f64(vreinterpretq_f64_m128d(a), 0);
4047#else
4048 double _a =
4049 sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0));
4050 return _a;
4051#endif
4052}
4053
4054// Convert the lower double-precision (64-bit) floating-point element in a to a
4055// 32-bit integer, and store the result in dst.
4056// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsd_si32
4057FORCE_INLINE int32_t _mm_cvtsd_si32(__m128d a)
4058{
4059#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
4060 return (int32_t) vgetq_lane_f64(vrndiq_f64(vreinterpretq_f64_m128d(a)), 0);
4061#else
4062 __m128d rnd = _mm_round_pd(a, _MM_FROUND_CUR_DIRECTION);
4063 double ret = sse2neon_recast_u64_f64(
4064 vgetq_lane_u64(vreinterpretq_u64_m128d(rnd), 0));
4065 return (int32_t) ret;
4066#endif
4067}
4068
4069// Convert the lower double-precision (64-bit) floating-point element in a to a
4070// 64-bit integer, and store the result in dst.
4071// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsd_si64
4072FORCE_INLINE int64_t _mm_cvtsd_si64(__m128d a)
4073{
4074#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
4075 return (int64_t) vgetq_lane_f64(vrndiq_f64(vreinterpretq_f64_m128d(a)), 0);
4076#else
4077 __m128d rnd = _mm_round_pd(a, _MM_FROUND_CUR_DIRECTION);
4078 double ret = sse2neon_recast_u64_f64(
4079 vgetq_lane_u64(vreinterpretq_u64_m128d(rnd), 0));
4080 return (int64_t) ret;
4081#endif
4082}
4083
4084// Convert the lower double-precision (64-bit) floating-point element in a to a
4085// 64-bit integer, and store the result in dst.
4086// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsd_si64x
4087#define _mm_cvtsd_si64x _mm_cvtsd_si64
4088
4089// Convert the lower double-precision (64-bit) floating-point element in b to a
4090// single-precision (32-bit) floating-point element, store the result in the
4091// lower element of dst, and copy the upper 3 packed elements from a to the
4092// upper elements of dst.
4093// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsd_ss
4094FORCE_INLINE __m128 _mm_cvtsd_ss(__m128 a, __m128d b)
4095{
4096#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
4097 return vreinterpretq_m128_f32(vsetq_lane_f32(
4098 vget_lane_f32(vcvt_f32_f64(vreinterpretq_f64_m128d(b)), 0),
4099 vreinterpretq_f32_m128(a), 0));
4100#else
4101 double b0 =
4102 sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0));
4103 return vreinterpretq_m128_f32(
4104 vsetq_lane_f32((float) b0, vreinterpretq_f32_m128(a), 0));
4105#endif
4106}
4107
4108// Copy the lower 32-bit integer in a to dst.
4109// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi128_si32
4110FORCE_INLINE int _mm_cvtsi128_si32(__m128i a)
4111{
4112 return vgetq_lane_s32(vreinterpretq_s32_m128i(a), 0);
4113}
4114
4115// Copy the lower 64-bit integer in a to dst.
4116// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi128_si64
4117FORCE_INLINE int64_t _mm_cvtsi128_si64(__m128i a)
4118{
4119 return vgetq_lane_s64(vreinterpretq_s64_m128i(a), 0);
4120}
4121
4122// Copy the lower 64-bit integer in a to dst.
4123// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi128_si64x
4124#define _mm_cvtsi128_si64x(a) _mm_cvtsi128_si64(a)
4125
4126// Convert the signed 32-bit integer b to a double-precision (64-bit)
4127// floating-point element, store the result in the lower element of dst, and
4128// copy the upper element from a to the upper element of dst.
4129// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi32_sd
4130FORCE_INLINE __m128d _mm_cvtsi32_sd(__m128d a, int32_t b)
4131{
4132#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
4133 return vreinterpretq_m128d_f64(
4134 vsetq_lane_f64((double) b, vreinterpretq_f64_m128d(a), 0));
4135#else
4136 int64_t _b = sse2neon_recast_f64_s64((double) b);
4137 return vreinterpretq_m128d_s64(
4138 vsetq_lane_s64(_b, vreinterpretq_s64_m128d(a), 0));
4139#endif
4140}
4141
4142// Copy the lower 64-bit integer in a to dst.
4143// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi128_si64x
4144#define _mm_cvtsi128_si64x(a) _mm_cvtsi128_si64(a)
4145
4146// Copy 32-bit integer a to the lower elements of dst, and zero the upper
4147// elements of dst.
4148// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi32_si128
4149FORCE_INLINE __m128i _mm_cvtsi32_si128(int a)
4150{
4151 return vreinterpretq_m128i_s32(vsetq_lane_s32(a, vdupq_n_s32(0), 0));
4152}
4153
4154// Convert the signed 64-bit integer b to a double-precision (64-bit)
4155// floating-point element, store the result in the lower element of dst, and
4156// copy the upper element from a to the upper element of dst.
4157// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi64_sd
4158FORCE_INLINE __m128d _mm_cvtsi64_sd(__m128d a, int64_t b)
4159{
4160#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
4161 return vreinterpretq_m128d_f64(
4162 vsetq_lane_f64((double) b, vreinterpretq_f64_m128d(a), 0));
4163#else
4164 int64_t _b = sse2neon_recast_f64_s64((double) b);
4165 return vreinterpretq_m128d_s64(
4166 vsetq_lane_s64(_b, vreinterpretq_s64_m128d(a), 0));
4167#endif
4168}
4169
4170// Copy 64-bit integer a to the lower element of dst, and zero the upper
4171// element.
4172// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi64_si128
4173FORCE_INLINE __m128i _mm_cvtsi64_si128(int64_t a)
4174{
4175 return vreinterpretq_m128i_s64(vsetq_lane_s64(a, vdupq_n_s64(0), 0));
4176}
4177
4178// Copy 64-bit integer a to the lower element of dst, and zero the upper
4179// element.
4180// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi64x_si128
4181#define _mm_cvtsi64x_si128(a) _mm_cvtsi64_si128(a)
4182
4183// Convert the signed 64-bit integer b to a double-precision (64-bit)
4184// floating-point element, store the result in the lower element of dst, and
4185// copy the upper element from a to the upper element of dst.
4186// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi64x_sd
4187#define _mm_cvtsi64x_sd(a, b) _mm_cvtsi64_sd(a, b)
4188
4189// Convert the lower single-precision (32-bit) floating-point element in b to a
4190// double-precision (64-bit) floating-point element, store the result in the
4191// lower element of dst, and copy the upper element from a to the upper element
4192// of dst.
4193// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtss_sd
4194FORCE_INLINE __m128d _mm_cvtss_sd(__m128d a, __m128 b)
4195{
4196 double d = (double) vgetq_lane_f32(vreinterpretq_f32_m128(b), 0);
4197#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
4198 return vreinterpretq_m128d_f64(
4199 vsetq_lane_f64(d, vreinterpretq_f64_m128d(a), 0));
4200#else
4201 return vreinterpretq_m128d_s64(vsetq_lane_s64(
4202 sse2neon_recast_f64_s64(d), vreinterpretq_s64_m128d(a), 0));
4203#endif
4204}
4205
4206// Convert packed double-precision (64-bit) floating-point elements in a to
4207// packed 32-bit integers with truncation, and store the results in dst.
4208// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttpd_epi32
4209FORCE_INLINE __m128i _mm_cvttpd_epi32(__m128d a)
4210{
4211 double a0, a1;
4212 a0 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0));
4213 a1 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1));
4214 return _mm_set_epi32(0, 0, (int32_t) a1, (int32_t) a0);
4215}
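// Example (illustrative only): the extra 't' means truncation toward zero,
// regardless of the current rounding mode:
//   __m128i t = _mm_cvttpd_epi32(_mm_set_pd(-1.7, 2.9));  // lanes {2, -1, 0, 0}
//   __m128i r = _mm_cvtpd_epi32(_mm_set_pd(-1.7, 2.9));   // lanes {3, -2, 0, 0} by default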
4216
4217// Convert packed double-precision (64-bit) floating-point elements in a to
4218// packed 32-bit integers with truncation, and store the results in dst.
4219// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttpd_pi32
4220FORCE_INLINE __m64 _mm_cvttpd_pi32(__m128d a)
4221{
4222 double a0, a1;
4223 a0 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0));
4224 a1 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1));
4225 int32_t ALIGN_STRUCT(16) data[2] = {(int32_t) a0, (int32_t) a1};
4226 return vreinterpret_m64_s32(vld1_s32(data));
4227}
4228
4229// Convert packed single-precision (32-bit) floating-point elements in a to
4230// packed 32-bit integers with truncation, and store the results in dst.
4231// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttps_epi32
4232FORCE_INLINE __m128i _mm_cvttps_epi32(__m128 a)
4233{
4234 return vreinterpretq_m128i_s32(vcvtq_s32_f32(vreinterpretq_f32_m128(a)));
4235}
4236
4237// Convert the lower double-precision (64-bit) floating-point element in a to a
4238// 32-bit integer with truncation, and store the result in dst.
4239// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttsd_si32
4240FORCE_INLINE int32_t _mm_cvttsd_si32(__m128d a)
4241{
4242 double _a =
4243 sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0));
4244 return (int32_t) _a;
4245}
4246
4247// Convert the lower double-precision (64-bit) floating-point element in a to a
4248// 64-bit integer with truncation, and store the result in dst.
4249// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttsd_si64
4250FORCE_INLINE int64_t _mm_cvttsd_si64(__m128d a)
4251{
4252#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
4253 return vgetq_lane_s64(vcvtq_s64_f64(vreinterpretq_f64_m128d(a)), 0);
4254#else
4255 double _a =
4256 sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0));
4257 return (int64_t) _a;
4258#endif
4259}
4260
4261// Convert the lower double-precision (64-bit) floating-point element in a to a
4262// 64-bit integer with truncation, and store the result in dst.
4263// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttsd_si64x
4264#define _mm_cvttsd_si64x(a) _mm_cvttsd_si64(a)
4265
4266// Divide packed double-precision (64-bit) floating-point elements in a by
4267// packed elements in b, and store the results in dst.
4268// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_div_pd
4269FORCE_INLINE __m128d _mm_div_pd(__m128d a, __m128d b)
4270{
4271#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
4272 return vreinterpretq_m128d_f64(
4273 vdivq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
4274#else
4275 double a0 =
4276 sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0));
4277 double a1 =
4278 sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1));
4279 double b0 =
4280 sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0));
4281 double b1 =
4282 sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 1));
4283 double c[2];
4284 c[0] = a0 / b0;
4285 c[1] = a1 / b1;
4286 return vld1q_f32((float32_t *) c);
4287#endif
4288}
4289
4290// Divide the lower double-precision (64-bit) floating-point element in a by the
4291// lower double-precision (64-bit) floating-point element in b, store the result
4292// in the lower element of dst, and copy the upper element from a to the upper
4293// element of dst.
4294// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_div_sd
4295FORCE_INLINE __m128d _mm_div_sd(__m128d a, __m128d b)
4296{
4297#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
4298 float64x2_t tmp =
4299 vdivq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b));
4300 return vreinterpretq_m128d_f64(
4301 vsetq_lane_f64(vgetq_lane_f64(vreinterpretq_f64_m128d(a), 1), tmp, 1));
4302#else
4303 return _mm_move_sd(a, _mm_div_pd(a, b));
4304#endif
4305}
4306
4307// Extract a 16-bit integer from a, selected with imm8, and store the result in
4308// the lower element of dst.
4309// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_extract_epi16
4310// FORCE_INLINE int _mm_extract_epi16(__m128i a, __constrange(0,8) int imm)
4311#define _mm_extract_epi16(a, imm) \
4312 vgetq_lane_u16(vreinterpretq_u16_m128i(a), (imm))
4313
4314// Copy a to dst, and insert the 16-bit integer i into dst at the location
4315// specified by imm8.
4316// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_insert_epi16
4317// FORCE_INLINE __m128i _mm_insert_epi16(__m128i a, int b,
4318// __constrange(0,8) int imm)
4319#define _mm_insert_epi16(a, b, imm) \
4320 vreinterpretq_m128i_s16( \
4321 vsetq_lane_s16((b), vreinterpretq_s16_m128i(a), (imm)))
4322
4323// Load 128-bits (composed of 2 packed double-precision (64-bit) floating-point
4324// elements) from memory into dst. mem_addr must be aligned on a 16-byte
4325// boundary or a general-protection exception may be generated.
4326// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load_pd
4327FORCE_INLINE __m128d _mm_load_pd(const double *p)
4328{
4329#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
4330 return vreinterpretq_m128d_f64(vld1q_f64(p));
4331#else
4332 const float *fp = (const float *) p;
4333 float ALIGN_STRUCT(16) data[4] = {fp[0], fp[1], fp[2], fp[3]};
4334 return vreinterpretq_m128d_f32(vld1q_f32(data));
4335#endif
4336}
4337
4338// Load a double-precision (64-bit) floating-point element from memory into both
4339// elements of dst.
4340// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load_pd1
4341#define _mm_load_pd1 _mm_load1_pd
4342
4343// Load a double-precision (64-bit) floating-point element from memory into the
4344// lower of dst, and zero the upper element. mem_addr does not need to be
4345// aligned on any particular boundary.
4346// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load_sd
4347FORCE_INLINE __m128d _mm_load_sd(const double *p)
4348{
4349#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
4350 return vreinterpretq_m128d_f64(vsetq_lane_f64(*p, vdupq_n_f64(0), 0));
4351#else
4352 const float *fp = (const float *) p;
4353 float ALIGN_STRUCT(16) data[4] = {fp[0], fp[1], 0, 0};
4354 return vreinterpretq_m128d_f32(vld1q_f32(data));
4355#endif
4356}
4357
4358// Load 128-bits of integer data from memory into dst. mem_addr must be aligned
4359// on a 16-byte boundary or a general-protection exception may be generated.
4360// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load_si128
4361FORCE_INLINE __m128i _mm_load_si128(const __m128i *p)
4362{
4363 return vreinterpretq_m128i_s32(vld1q_s32((const int32_t *) p));
4364}
4365
4366// Load a double-precision (64-bit) floating-point element from memory into both
4367// elements of dst.
4368// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load1_pd
4369FORCE_INLINE __m128d _mm_load1_pd(const double *p)
4370{
4371#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
4372 return vreinterpretq_m128d_f64(vld1q_dup_f64(p));
4373#else
4374 return vreinterpretq_m128d_s64(vdupq_n_s64(*(const int64_t *) p));
4375#endif
4376}
4377
4378// Load a double-precision (64-bit) floating-point element from memory into the
4379// upper element of dst, and copy the lower element from a to dst. mem_addr does
4380// not need to be aligned on any particular boundary.
4381// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadh_pd
4382FORCE_INLINE __m128d _mm_loadh_pd(__m128d a, const double *p)
4383{
4384#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
4385 return vreinterpretq_m128d_f64(
4386 vcombine_f64(vget_low_f64(vreinterpretq_f64_m128d(a)), vld1_f64(p)));
4387#else
4388 return vreinterpretq_m128d_f32(vcombine_f32(
4389 vget_low_f32(vreinterpretq_f32_m128d(a)), vld1_f32((const float *) p)));
4390#endif
4391}
4392
4393// Load 64-bit integer from memory into the first element of dst.
4394// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadl_epi64
4395FORCE_INLINE __m128i _mm_loadl_epi64(__m128i const *p)
4396{
4397 /* Load the lower 64 bits of the value pointed to by p into the
4398 * lower 64 bits of the result, zeroing the upper 64 bits of the result.
4399 */
4400 return vreinterpretq_m128i_s32(
4401 vcombine_s32(vld1_s32((int32_t const *) p), vcreate_s32(0)));
4402}
4403
4404// Load a double-precision (64-bit) floating-point element from memory into the
4405// lower element of dst, and copy the upper element from a to dst. mem_addr does
4406// not need to be aligned on any particular boundary.
4407// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadl_pd
4408FORCE_INLINE __m128d _mm_loadl_pd(__m128d a, const double *p)
4409{
4410#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
4411 return vreinterpretq_m128d_f64(
4412 vcombine_f64(vld1_f64(p), vget_high_f64(vreinterpretq_f64_m128d(a))));
4413#else
4414 return vreinterpretq_m128d_f32(
4415 vcombine_f32(vld1_f32((const float *) p),
4416 vget_high_f32(vreinterpretq_f32_m128d(a))));
4417#endif
4418}
4419
4420// Load 2 double-precision (64-bit) floating-point elements from memory into dst
4421// in reverse order. mem_addr must be aligned on a 16-byte boundary or a
4422// general-protection exception may be generated.
4423// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadr_pd
4424FORCE_INLINE __m128d _mm_loadr_pd(const double *p)
4425{
4426#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
4427 float64x2_t v = vld1q_f64(p);
4428 return vreinterpretq_m128d_f64(vextq_f64(v, v, 1));
4429#else
4430 int64x2_t v = vld1q_s64((const int64_t *) p);
4431 return vreinterpretq_m128d_s64(vextq_s64(v, v, 1));
4432#endif
4433}
4434
4435// Load 2 double-precision (64-bit) floating-point elements from unaligned memory into dst.
4436// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadu_pd
4437FORCE_INLINE __m128d _mm_loadu_pd(const double *p)
4438{
4439 return _mm_load_pd(p);
4440}
4441
4442// Load 128-bits of integer data from memory into dst. mem_addr does not need to
4443// be aligned on any particular boundary.
4444// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadu_si128
4445FORCE_INLINE __m128i _mm_loadu_si128(const __m128i *p)
4446{
4447 return vreinterpretq_m128i_s32(vld1q_s32((const unaligned_int32_t *) p));
4448}
4449
4450// Load unaligned 32-bit integer from memory into the first element of dst.
4451// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadu_si32
4452FORCE_INLINE __m128i _mm_loadu_si32(const void *p)
4453{
4454 return vreinterpretq_m128i_s32(
4455 vsetq_lane_s32(*(const unaligned_int32_t *) p, vdupq_n_s32(0), 0));
4456}
4457
4458// Multiply packed signed 16-bit integers in a and b, producing intermediate
4459// signed 32-bit integers. Horizontally add adjacent pairs of intermediate
4460// 32-bit integers, and pack the results in dst.
4461// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_madd_epi16
4462FORCE_INLINE __m128i _mm_madd_epi16(__m128i a, __m128i b)
4463{
4464 int32x4_t low = vmull_s16(vget_low_s16(vreinterpretq_s16_m128i(a)),
4465 vget_low_s16(vreinterpretq_s16_m128i(b)));
4466#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
4467 int32x4_t high =
4468 vmull_high_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b));
4469
4470 return vreinterpretq_m128i_s32(vpaddq_s32(low, high));
4471#else
4472 int32x4_t high = vmull_s16(vget_high_s16(vreinterpretq_s16_m128i(a)),
4473 vget_high_s16(vreinterpretq_s16_m128i(b)));
4474
4475 int32x2_t low_sum = vpadd_s32(vget_low_s32(low), vget_high_s32(low));
4476 int32x2_t high_sum = vpadd_s32(vget_low_s32(high), vget_high_s32(high));
4477
4478 return vreinterpretq_m128i_s32(vcombine_s32(low_sum, high_sum));
4479#endif
4480}
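// Illustrative usage (editor's sketch, not part of upstream sse2neon);
// _mm_setr_epi16 lists element 0 first:
//   __m128i a = _mm_setr_epi16(1, 2, 3, 4, 5, 6, 7, 8);
//   __m128i b = _mm_setr_epi16(10, 20, 30, 40, 50, 60, 70, 80);
//   __m128i r = _mm_madd_epi16(a, b); // 32-bit lanes: 50, 250, 610, 1130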
4481
4482// Conditionally store 8-bit integer elements from a into memory using mask
4483// (elements are not stored when the highest bit is not set in the corresponding
4484// element) and a non-temporal memory hint. mem_addr does not need to be aligned
4485// on any particular boundary.
4486// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskmoveu_si128
4487FORCE_INLINE void _mm_maskmoveu_si128(__m128i a, __m128i mask, char *mem_addr)
4488{
4489 int8x16_t shr_mask = vshrq_n_s8(vreinterpretq_s8_m128i(mask), 7);
4490 __m128 b = _mm_load_ps((const float *) mem_addr);
4491 int8x16_t masked =
4492 vbslq_s8(vreinterpretq_u8_s8(shr_mask), vreinterpretq_s8_m128i(a),
4493 vreinterpretq_s8_m128(b));
4494 vst1q_s8((int8_t *) mem_addr, masked);
4495}
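// Illustrative usage (editor's sketch, not part of upstream sse2neon): only
// bytes whose mask byte has its most significant bit set are written back.
//   char buf[16] = {0};
//   __m128i data = _mm_set1_epi8(0x7F);
//   __m128i mask = _mm_setr_epi8(-1, 0, -1, 0, 0, 0, 0, 0,
//                                0, 0, 0, 0, 0, 0, 0, -1);
//   _mm_maskmoveu_si128(data, mask, buf); // buf[0], buf[2], buf[15] become 0x7F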
4496
4497// Compare packed signed 16-bit integers in a and b, and store packed maximum
4498// values in dst.
4499// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_epi16
4500FORCE_INLINE __m128i _mm_max_epi16(__m128i a, __m128i b)
4501{
4502 return vreinterpretq_m128i_s16(
4503 vmaxq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
4504}
4505
4506// Compare packed unsigned 8-bit integers in a and b, and store packed maximum
4507// values in dst.
4508// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_epu8
4509FORCE_INLINE __m128i _mm_max_epu8(__m128i a, __m128i b)
4510{
4511 return vreinterpretq_m128i_u8(
4512 vmaxq_u8(vreinterpretq_u8_m128i(a), vreinterpretq_u8_m128i(b)));
4513}
4514
4515// Compare packed double-precision (64-bit) floating-point elements in a and b,
4516// and store packed maximum values in dst.
4517// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_pd
4518FORCE_INLINE __m128d _mm_max_pd(__m128d a, __m128d b)
4519{
4520#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
4521#if SSE2NEON_PRECISE_MINMAX
4522 float64x2_t _a = vreinterpretq_f64_m128d(a);
4523 float64x2_t _b = vreinterpretq_f64_m128d(b);
4524 return vreinterpretq_m128d_f64(vbslq_f64(vcgtq_f64(_a, _b), _a, _b));
4525#else
4526 return vreinterpretq_m128d_f64(
4527 vmaxq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
4528#endif
4529#else
4530 double a0 =
4531 sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0));
4532 double a1 =
4533 sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1));
4534 double b0 =
4535 sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0));
4536 double b1 =
4537 sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 1));
4538 int64_t d[2];
4539 d[0] = a0 > b0 ? sse2neon_recast_f64_s64(a0) : sse2neon_recast_f64_s64(b0);
4540 d[1] = a1 > b1 ? sse2neon_recast_f64_s64(a1) : sse2neon_recast_f64_s64(b1);
4541
4542 return vreinterpretq_m128d_s64(vld1q_s64(d));
4543#endif
4544}
4545
4546// Compare the lower double-precision (64-bit) floating-point elements in a and
4547// b, store the maximum value in the lower element of dst, and copy the upper
4548// element from a to the upper element of dst.
4549// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_sd
4550FORCE_INLINE __m128d _mm_max_sd(__m128d a, __m128d b)
4551{
4552#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
4553 return _mm_move_sd(a, _mm_max_pd(a, b));
4554#else
4555 double a0, a1, b0;
4556 a0 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0));
4557 a1 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1));
4558 b0 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0));
4559 double c[2] = {a0 > b0 ? a0 : b0, a1};
4560 return vreinterpretq_m128d_f32(vld1q_f32((float32_t *) c));
4561#endif
4562}
4563
4564// Compare packed signed 16-bit integers in a and b, and store packed minimum
4565// values in dst.
4566// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_epi16
4567FORCE_INLINE __m128i _mm_min_epi16(__m128i a, __m128i b)
4568{
4569 return vreinterpretq_m128i_s16(
4570 vminq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
4571}
4572
4573// Compare packed unsigned 8-bit integers in a and b, and store packed minimum
4574// values in dst.
4575// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_epu8
4576FORCE_INLINE __m128i _mm_min_epu8(__m128i a, __m128i b)
4577{
4578 return vreinterpretq_m128i_u8(
4579 vminq_u8(vreinterpretq_u8_m128i(a), vreinterpretq_u8_m128i(b)));
4580}
4581
4582// Compare packed double-precision (64-bit) floating-point elements in a and b,
4583// and store packed minimum values in dst.
4584// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_pd
4585FORCE_INLINE __m128d _mm_min_pd(__m128d a, __m128d b)
4586{
4587#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
4588#if SSE2NEON_PRECISE_MINMAX
4589 float64x2_t _a = vreinterpretq_f64_m128d(a);
4590 float64x2_t _b = vreinterpretq_f64_m128d(b);
4591 return vreinterpretq_m128d_f64(vbslq_f64(vcltq_f64(_a, _b), _a, _b));
4592#else
4593 return vreinterpretq_m128d_f64(
4594 vminq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
4595#endif
4596#else
4597 double a0 =
4598 sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0));
4599 double a1 =
4600 sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1));
4601 double b0 =
4602 sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0));
4603 double b1 =
4604 sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 1));
4605 int64_t d[2];
4606 d[0] = a0 < b0 ? sse2neon_recast_f64_s64(a0) : sse2neon_recast_f64_s64(b0);
4607 d[1] = a1 < b1 ? sse2neon_recast_f64_s64(a1) : sse2neon_recast_f64_s64(b1);
4608 return vreinterpretq_m128d_s64(vld1q_s64(d));
4609#endif
4610}
4611
4612// Compare the lower double-precision (64-bit) floating-point elements in a and
4613// b, store the minimum value in the lower element of dst, and copy the upper
4614// element from a to the upper element of dst.
4615// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_sd
4616FORCE_INLINE __m128d _mm_min_sd(__m128d a, __m128d b)
4617{
4618#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
4619 return _mm_move_sd(a, _mm_min_pd(a, b));
4620#else
4621 double a0, a1, b0;
4622 a0 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0));
4623 a1 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1));
4624 b0 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0));
4625 double c[2] = {a0 < b0 ? a0 : b0, a1};
4626 return vreinterpretq_m128d_f32(vld1q_f32((float32_t *) c));
4627#endif
4628}
4629
4630// Copy the lower 64-bit integer in a to the lower element of dst, and zero the
4631// upper element.
4632// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_move_epi64
4633FORCE_INLINE __m128i _mm_move_epi64(__m128i a)
4634{
4635 return vreinterpretq_m128i_s64(
4636 vsetq_lane_s64(0, vreinterpretq_s64_m128i(a), 1));
4637}
4638
4639// Move the lower double-precision (64-bit) floating-point element from b to the
4640// lower element of dst, and copy the upper element from a to the upper element
4641// of dst.
4642// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_move_sd
4643FORCE_INLINE __m128d _mm_move_sd(__m128d a, __m128d b)
4644{
4645 return vreinterpretq_m128d_f32(
4646 vcombine_f32(vget_low_f32(vreinterpretq_f32_m128d(b)),
4647 vget_high_f32(vreinterpretq_f32_m128d(a))));
4648}
4649
4650// Create mask from the most significant bit of each 8-bit element in a, and
4651// store the result in dst.
4652// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movemask_epi8
4653FORCE_INLINE int _mm_movemask_epi8(__m128i a)
4654{
4655 // Use increasingly wide shifts+adds to collect the sign bits
4656 // together.
4657 // Since the widening shifts would be rather confusing to follow in little
4658 // endian, everything will be illustrated in big endian order instead. This
4659 // has a different result - the bits would actually be reversed on a big
4660 // endian machine.
4661
4662 // Starting input (only half the elements are shown):
4663 // 89 ff 1d c0 00 10 99 33
4664 uint8x16_t input = vreinterpretq_u8_m128i(a);
4665
4666 // Shift out everything but the sign bits with an unsigned shift right.
4667 //
4668 // Bytes of the vector:
4669 // 89 ff 1d c0 00 10 99 33
4670 // \ \ \ \ \ \ \ \ high_bits = (uint16x4_t)(input >> 7)
4671 // | | | | | | | |
4672 // 01 01 00 01 00 00 01 00
4673 //
4674 // Bits of first important lane(s):
4675 // 10001001 (89)
4676 // \______
4677 // |
4678 // 00000001 (01)
4679 uint16x8_t high_bits = vreinterpretq_u16_u8(vshrq_n_u8(input, 7));
4680
4681 // Merge the even lanes together with a 16-bit unsigned shift right + add.
4682 // 'xx' represents garbage data which will be ignored in the final result.
4683 // In the important bytes, the add functions like a binary OR.
4684 //
4685 // 01 01 00 01 00 00 01 00
4686 // \_ | \_ | \_ | \_ | paired16 = (uint32x4_t)(input + (input >> 7))
4687 // \| \| \| \|
4688 // xx 03 xx 01 xx 00 xx 02
4689 //
4690 // 00000001 00000001 (01 01)
4691 // \_______ |
4692 // \|
4693 // xxxxxxxx xxxxxx11 (xx 03)
4694 uint32x4_t paired16 =
4695 vreinterpretq_u32_u16(vsraq_n_u16(high_bits, high_bits, 7));
4696
4697 // Repeat with a wider 32-bit shift + add.
4698 // xx 03 xx 01 xx 00 xx 02
4699 // \____ | \____ | paired32 = (uint64x1_t)(paired16 + (paired16 >>
4700 // 14))
4701 // \| \|
4702 // xx xx xx 0d xx xx xx 02
4703 //
4704 // 00000011 00000001 (03 01)
4705 // \\_____ ||
4706 // '----.\||
4707 // xxxxxxxx xxxx1101 (xx 0d)
4708 uint64x2_t paired32 =
4709 vreinterpretq_u64_u32(vsraq_n_u32(paired16, paired16, 14));
4710
4711 // Last, an even wider 64-bit shift + add to get our result in the low 8 bit
4712 // lanes. xx xx xx 0d xx xx xx 02
4713 // \_________ | paired64 = (uint8x8_t)(paired32 + (paired32 >>
4714 // 28))
4715 // \|
4716 // xx xx xx xx xx xx xx d2
4717 //
4718 // 00001101 00000010 (0d 02)
4719 // \ \___ | |
4720 // '---. \| |
4721 // xxxxxxxx 11010010 (xx d2)
4722 uint8x16_t paired64 =
4723 vreinterpretq_u8_u64(vsraq_n_u64(paired32, paired32, 28));
4724
4725 // Extract the low 8 bits from each 64-bit lane with 2 8-bit extracts.
4726 // xx xx xx xx xx xx xx d2
4727 // || return paired64[0]
4728 // d2
4729 // Note: Little endian would return the correct value 4b (01001011) instead.
4730 return vgetq_lane_u8(paired64, 0) | ((int) vgetq_lane_u8(paired64, 8) << 8);
4731}
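// Illustrative usage (editor's sketch, not part of upstream sse2neon): bit i
// of the result is the most significant bit of byte i of a.
//   __m128i v = _mm_setr_epi8(-1, 0, 0, 0, 0, 0, 0, 0,
//                             0, 0, 0, 0, 0, 0, 0, -1);
//   int m = _mm_movemask_epi8(v); // m == 0x8001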
4732
4733// Set each bit of mask dst based on the most significant bit of the
4734// corresponding packed double-precision (64-bit) floating-point element in a.
4735// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movemask_pd
4736FORCE_INLINE int _mm_movemask_pd(__m128d a)
4737{
4738 uint64x2_t input = vreinterpretq_u64_m128d(a);
4739 uint64x2_t high_bits = vshrq_n_u64(input, 63);
4740 return (int) (vgetq_lane_u64(high_bits, 0) |
4741 (vgetq_lane_u64(high_bits, 1) << 1));
4742}
4743
4744// Copy the lower 64-bit integer in a to dst.
4745// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movepi64_pi64
4746FORCE_INLINE __m64 _mm_movepi64_pi64(__m128i a)
4747{
4748 return vreinterpret_m64_s64(vget_low_s64(vreinterpretq_s64_m128i(a)));
4749}
4750
4751// Copy the 64-bit integer a to the lower element of dst, and zero the upper
4752// element.
4753// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movpi64_epi64
4754FORCE_INLINE __m128i _mm_movpi64_epi64(__m64 a)
4755{
4756 return vreinterpretq_m128i_s64(
4757 vcombine_s64(vreinterpret_s64_m64(a), vdup_n_s64(0)));
4758}
4759
4760// Multiply the low unsigned 32-bit integers from each packed 64-bit element in
4761// a and b, and store the unsigned 64-bit results in dst.
4762// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mul_epu32
4763FORCE_INLINE __m128i _mm_mul_epu32(__m128i a, __m128i b)
4764{
4765 // vmull_u32 upcasts instead of masking, so we downcast.
4766 uint32x2_t a_lo = vmovn_u64(vreinterpretq_u64_m128i(a));
4767 uint32x2_t b_lo = vmovn_u64(vreinterpretq_u64_m128i(b));
4768 return vreinterpretq_m128i_u64(vmull_u32(a_lo, b_lo));
4769}
4770
4771// Multiply packed double-precision (64-bit) floating-point elements in a and b,
4772// and store the results in dst.
4773// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mul_pd
4774FORCE_INLINE __m128d _mm_mul_pd(__m128d a, __m128d b)
4775{
4776#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
4777 return vreinterpretq_m128d_f64(
4778 vmulq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
4779#else
4780 double a0 =
4781 sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0));
4782 double a1 =
4783 sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1));
4784 double b0 =
4785 sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0));
4786 double b1 =
4787 sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 1));
4788 double c[2];
4789 c[0] = a0 * b0;
4790 c[1] = a1 * b1;
4791 return vld1q_f32((float32_t *) c);
4792#endif
4793}
4794
4795// Multiply the lower double-precision (64-bit) floating-point element in a and
4796// b, store the result in the lower element of dst, and copy the upper element
4797// from a to the upper element of dst.
4798// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_mul_sd
4799FORCE_INLINE __m128d _mm_mul_sd(__m128d a, __m128d b)
4800{
4801 return _mm_move_sd(a, _mm_mul_pd(a, b));
4802}
4803
4804// Multiply the low unsigned 32-bit integers from a and b, and store the
4805// unsigned 64-bit result in dst.
4806// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mul_su32
4807FORCE_INLINE __m64 _mm_mul_su32(__m64 a, __m64 b)
4808{
4809 return vreinterpret_m64_u64(vget_low_u64(
4810 vmull_u32(vreinterpret_u32_m64(a), vreinterpret_u32_m64(b))));
4811}
4812
4813// Multiply the packed signed 16-bit integers in a and b, producing intermediate
4814// 32-bit integers, and store the high 16 bits of the intermediate integers in
4815// dst.
4816// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mulhi_epi16
4817FORCE_INLINE __m128i _mm_mulhi_epi16(__m128i a, __m128i b)
4818{
4819 /* FIXME: issue with large values because of result saturation */
4820 // int16x8_t ret = vqdmulhq_s16(vreinterpretq_s16_m128i(a),
4821 // vreinterpretq_s16_m128i(b)); /* =2*a*b */ return
4822 // vreinterpretq_m128i_s16(vshrq_n_s16(ret, 1));
4823 int16x4_t a3210 = vget_low_s16(vreinterpretq_s16_m128i(a));
4824 int16x4_t b3210 = vget_low_s16(vreinterpretq_s16_m128i(b));
4825 int32x4_t ab3210 = vmull_s16(a3210, b3210); /* 3333222211110000 */
4826 int16x4_t a7654 = vget_high_s16(vreinterpretq_s16_m128i(a));
4827 int16x4_t b7654 = vget_high_s16(vreinterpretq_s16_m128i(b));
4828 int32x4_t ab7654 = vmull_s16(a7654, b7654); /* 7777666655554444 */
4829 uint16x8x2_t r =
4830 vuzpq_u16(vreinterpretq_u16_s32(ab3210), vreinterpretq_u16_s32(ab7654));
4831 return vreinterpretq_m128i_u16(r.val[1]);
4832}
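// Illustrative usage (editor's sketch, not part of upstream sse2neon): the
// full 32-bit product 16384 * 4 = 65536, so the stored high half is 1.
//   __m128i a = _mm_set1_epi16(16384);
//   __m128i b = _mm_set1_epi16(4);
//   __m128i r = _mm_mulhi_epi16(a, b); // every 16-bit lane holds 1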
4833
4834// Multiply the packed unsigned 16-bit integers in a and b, producing
4835// intermediate 32-bit integers, and store the high 16 bits of the intermediate
4836// integers in dst.
4837// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mulhi_epu16
4838FORCE_INLINE __m128i _mm_mulhi_epu16(__m128i a, __m128i b)
4839{
4840 uint16x4_t a3210 = vget_low_u16(vreinterpretq_u16_m128i(a));
4841 uint16x4_t b3210 = vget_low_u16(vreinterpretq_u16_m128i(b));
4842 uint32x4_t ab3210 = vmull_u16(a3210, b3210);
4843#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
4844 uint32x4_t ab7654 =
4845 vmull_high_u16(vreinterpretq_u16_m128i(a), vreinterpretq_u16_m128i(b));
4846 uint16x8_t r = vuzp2q_u16(vreinterpretq_u16_u32(ab3210),
4847 vreinterpretq_u16_u32(ab7654));
4848 return vreinterpretq_m128i_u16(r);
4849#else
4850 uint16x4_t a7654 = vget_high_u16(vreinterpretq_u16_m128i(a));
4851 uint16x4_t b7654 = vget_high_u16(vreinterpretq_u16_m128i(b));
4852 uint32x4_t ab7654 = vmull_u16(a7654, b7654);
4853 uint16x8x2_t r =
4854 vuzpq_u16(vreinterpretq_u16_u32(ab3210), vreinterpretq_u16_u32(ab7654));
4855 return vreinterpretq_m128i_u16(r.val[1]);
4856#endif
4857}
4858
4859// Multiply the packed 16-bit integers in a and b, producing intermediate 32-bit
4860// integers, and store the low 16 bits of the intermediate integers in dst.
4861// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mullo_epi16
4862FORCE_INLINE __m128i _mm_mullo_epi16(__m128i a, __m128i b)
4863{
4864 return vreinterpretq_m128i_s16(
4865 vmulq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
4866}
4867
4868// Compute the bitwise OR of packed double-precision (64-bit) floating-point
4869// elements in a and b, and store the results in dst.
4870// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_or_pd
4871FORCE_INLINE __m128d _mm_or_pd(__m128d a, __m128d b)
4872{
4873 return vreinterpretq_m128d_s64(
4874 vorrq_s64(vreinterpretq_s64_m128d(a), vreinterpretq_s64_m128d(b)));
4875}
4876
4877// Compute the bitwise OR of 128 bits (representing integer data) in a and b,
4878// and store the result in dst.
4879// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_or_si128
4880FORCE_INLINE __m128i _mm_or_si128(__m128i a, __m128i b)
4881{
4882 return vreinterpretq_m128i_s32(
4883 vorrq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
4884}
4885
4886// Convert packed signed 16-bit integers from a and b to packed 8-bit integers
4887// using signed saturation, and store the results in dst.
4888// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_packs_epi16
4889FORCE_INLINE __m128i _mm_packs_epi16(__m128i a, __m128i b)
4890{
4891 return vreinterpretq_m128i_s8(
4892 vcombine_s8(vqmovn_s16(vreinterpretq_s16_m128i(a)),
4893 vqmovn_s16(vreinterpretq_s16_m128i(b))));
4894}
4895
4896// Convert packed signed 32-bit integers from a and b to packed 16-bit integers
4897// using signed saturation, and store the results in dst.
4898// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_packs_epi32
4899FORCE_INLINE __m128i _mm_packs_epi32(__m128i a, __m128i b)
4900{
4901 return vreinterpretq_m128i_s16(
4902 vcombine_s16(vqmovn_s32(vreinterpretq_s32_m128i(a)),
4903 vqmovn_s32(vreinterpretq_s32_m128i(b))));
4904}
4905
4906// Convert packed signed 16-bit integers from a and b to packed 8-bit integers
4907// using unsigned saturation, and store the results in dst.
4908// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_packus_epi16
4909FORCE_INLINE __m128i _mm_packus_epi16(const __m128i a, const __m128i b)
4910{
4911 return vreinterpretq_m128i_u8(
4912 vcombine_u8(vqmovun_s16(vreinterpretq_s16_m128i(a)),
4913 vqmovun_s16(vreinterpretq_s16_m128i(b))));
4914}
4915
4916// Pause the processor. This is typically used in spin-wait loops and depending
4917// on the x86 processor typical values are in the 40-100 cycle range. The
4918// 'yield' instruction isn't a good fit because it's effectively a nop on most
4919// Arm cores. Experience with several databases has shown an 'isb' is
4920// a reasonable approximation.
4921// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_pause
4922FORCE_INLINE void _mm_pause(void)
4923{
4924#if defined(_MSC_VER) && !defined(__clang__)
4925 __isb(_ARM64_BARRIER_SY);
4926#else
4927 __asm__ __volatile__("isb\n");
4928#endif
4929}
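// Typical spin-wait usage (editor's sketch, not part of upstream sse2neon);
// assumes a C11 atomic flag set by another thread:
//   #include <stdatomic.h>
//   extern atomic_bool ready;
//   while (!atomic_load_explicit(&ready, memory_order_acquire))
//       _mm_pause();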
4930
4931// Compute the absolute differences of packed unsigned 8-bit integers in a and
4932// b, then horizontally sum each consecutive 8 differences to produce two
4933// unsigned 16-bit integers, and pack these unsigned 16-bit integers in the low
4934// 16 bits of 64-bit elements in dst.
4935// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sad_epu8
4936FORCE_INLINE __m128i _mm_sad_epu8(__m128i a, __m128i b)
4937{
4938 uint16x8_t t = vpaddlq_u8(vabdq_u8((uint8x16_t) a, (uint8x16_t) b));
4939 return vreinterpretq_m128i_u64(vpaddlq_u32(vpaddlq_u16(t)));
4940}
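// Illustrative usage (editor's sketch, not part of upstream sse2neon): each
// 64-bit lane receives the sum of 8 absolute byte differences.
//   __m128i a = _mm_set1_epi8(10);
//   __m128i b = _mm_set1_epi8(3);
//   __m128i r = _mm_sad_epu8(a, b); // both 64-bit lanes hold 8 * |10 - 3| = 56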
4941
4942// Set packed 16-bit integers in dst with the supplied values.
4943// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_epi16
4944FORCE_INLINE __m128i _mm_set_epi16(short i7,
4945 short i6,
4946 short i5,
4947 short i4,
4948 short i3,
4949 short i2,
4950 short i1,
4951 short i0)
4952{
4953 int16_t ALIGN_STRUCT(16) data[8] = {i0, i1, i2, i3, i4, i5, i6, i7};
4954 return vreinterpretq_m128i_s16(vld1q_s16(data));
4955}
4956
4957// Set packed 32-bit integers in dst with the supplied values.
4958// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_epi32
4959FORCE_INLINE __m128i _mm_set_epi32(int i3, int i2, int i1, int i0)
4960{
4961 int32_t ALIGN_STRUCT(16) data[4] = {i0, i1, i2, i3};
4962 return vreinterpretq_m128i_s32(vld1q_s32(data));
4963}
4964
4965// Set packed 64-bit integers in dst with the supplied values.
4966// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_epi64
4967FORCE_INLINE __m128i _mm_set_epi64(__m64 i1, __m64 i2)
4968{
4969 return _mm_set_epi64x(vget_lane_s64(i1, 0), vget_lane_s64(i2, 0));
4970}
4971
4972// Set packed 64-bit integers in dst with the supplied values.
4973// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_epi64x
4974FORCE_INLINE __m128i _mm_set_epi64x(int64_t i1, int64_t i2)
4975{
4976 return vreinterpretq_m128i_s64(
4977 vcombine_s64(vcreate_s64(i2), vcreate_s64(i1)));
4978}
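// Illustrative usage (editor's sketch, not part of upstream sse2neon): the
// first argument becomes the upper 64-bit element, the second the lower one.
//   __m128i v = _mm_set_epi64x(0x1111111111111111LL, 0x2222222222222222LL);
//   // element 0 (low)  == 0x2222222222222222
//   // element 1 (high) == 0x1111111111111111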
4979
4980// Set packed 8-bit integers in dst with the supplied values.
4981// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_epi8
4982FORCE_INLINE __m128i _mm_set_epi8(signed char b15,
4983 signed char b14,
4984 signed char b13,
4985 signed char b12,
4986 signed char b11,
4987 signed char b10,
4988 signed char b9,
4989 signed char b8,
4990 signed char b7,
4991 signed char b6,
4992 signed char b5,
4993 signed char b4,
4994 signed char b3,
4995 signed char b2,
4996 signed char b1,
4997 signed char b0)
4998{
4999 int8_t ALIGN_STRUCT(16) data[16] = {
5000 (int8_t) b0, (int8_t) b1, (int8_t) b2, (int8_t) b3,
5001 (int8_t) b4, (int8_t) b5, (int8_t) b6, (int8_t) b7,
5002 (int8_t) b8, (int8_t) b9, (int8_t) b10, (int8_t) b11,
5003 (int8_t) b12, (int8_t) b13, (int8_t) b14, (int8_t) b15};
5004 return (__m128i) vld1q_s8(data);
5005}
5006
5007// Set packed double-precision (64-bit) floating-point elements in dst with the
5008// supplied values.
5009// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_pd
5010FORCE_INLINE __m128d _mm_set_pd(double e1, double e0)
5011{
5012 double ALIGN_STRUCT(16) data[2] = {e0, e1};
5013#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
5014 return vreinterpretq_m128d_f64(vld1q_f64((float64_t *) data));
5015#else
5016 return vreinterpretq_m128d_f32(vld1q_f32((float32_t *) data));
5017#endif
5018}
5019
5020// Broadcast double-precision (64-bit) floating-point value a to all elements of
5021// dst.
5022// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_pd1
5023#define _mm_set_pd1 _mm_set1_pd
5024
5025// Copy double-precision (64-bit) floating-point element a to the lower element
5026// of dst, and zero the upper element.
5027// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_sd
5028FORCE_INLINE __m128d _mm_set_sd(double a)
5029{
5030#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
5031 return vreinterpretq_m128d_f64(vsetq_lane_f64(a, vdupq_n_f64(0), 0));
5032#else
5033 return _mm_set_pd(0, a);
5034#endif
5035}
5036
5037// Broadcast 16-bit integer a to all elements of dst.
5038// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set1_epi16
5039FORCE_INLINE __m128i _mm_set1_epi16(short w)
5040{
5041 return vreinterpretq_m128i_s16(vdupq_n_s16(w));
5042}
5043
5044// Broadcast 32-bit integer a to all elements of dst.
5045// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set1_epi32
5046FORCE_INLINE __m128i _mm_set1_epi32(int _i)
5047{
5048 return vreinterpretq_m128i_s32(vdupq_n_s32(_i));
5049}
5050
5051// Broadcast 64-bit integer a to all elements of dst.
5052// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set1_epi64
5053FORCE_INLINE __m128i _mm_set1_epi64(__m64 _i)
5054{
5055 return vreinterpretq_m128i_s64(vdupq_lane_s64(_i, 0));
5056}
5057
5058// Broadcast 64-bit integer a to all elements of dst.
5059// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set1_epi64x
5060FORCE_INLINE __m128i _mm_set1_epi64x(int64_t _i)
5061{
5062 return vreinterpretq_m128i_s64(vdupq_n_s64(_i));
5063}
5064
5065// Broadcast 8-bit integer a to all elements of dst.
5066// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set1_epi8
5067FORCE_INLINE __m128i _mm_set1_epi8(signed char w)
5068{
5069 return vreinterpretq_m128i_s8(vdupq_n_s8(w));
5070}
5071
5072// Broadcast double-precision (64-bit) floating-point value a to all elements of
5073// dst.
5074// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set1_pd
5075FORCE_INLINE __m128d _mm_set1_pd(double d)
5076{
5077#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
5078 return vreinterpretq_m128d_f64(vdupq_n_f64(d));
5079#else
5080 int64_t _d = sse2neon_recast_f64_s64(d);
5081 return vreinterpretq_m128d_s64(vdupq_n_s64(_d));
5082#endif
5083}
5084
5085// Set packed 16-bit integers in dst with the supplied values in reverse order.
5086// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setr_epi16
5087FORCE_INLINE __m128i _mm_setr_epi16(short w0,
5088 short w1,
5089 short w2,
5090 short w3,
5091 short w4,
5092 short w5,
5093 short w6,
5094 short w7)
5095{
5096 int16_t ALIGN_STRUCT(16) data[8] = {w0, w1, w2, w3, w4, w5, w6, w7};
5097 return vreinterpretq_m128i_s16(vld1q_s16((int16_t *) data));
5098}
5099
5100// Set packed 32-bit integers in dst with the supplied values in reverse order.
5101// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setr_epi32
5102FORCE_INLINE __m128i _mm_setr_epi32(int i3, int i2, int i1, int i0)
5103{
5104 int32_t ALIGN_STRUCT(16) data[4] = {i3, i2, i1, i0};
5105 return vreinterpretq_m128i_s32(vld1q_s32(data));
5106}
5107
5108// Set packed 64-bit integers in dst with the supplied values in reverse order.
5109// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setr_epi64
5110FORCE_INLINE __m128i _mm_setr_epi64(__m64 e1, __m64 e0)
5111{
5112 return vreinterpretq_m128i_s64(vcombine_s64(e1, e0));
5113}
5114
5115// Set packed 8-bit integers in dst with the supplied values in reverse order.
5116// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setr_epi8
5117FORCE_INLINE __m128i _mm_setr_epi8(signed char b0,
5118 signed char b1,
5119 signed char b2,
5120 signed char b3,
5121 signed char b4,
5122 signed char b5,
5123 signed char b6,
5124 signed char b7,
5125 signed char b8,
5126 signed char b9,
5127 signed char b10,
5128 signed char b11,
5129 signed char b12,
5130 signed char b13,
5131 signed char b14,
5132 signed char b15)
5133{
5134 int8_t ALIGN_STRUCT(16) data[16] = {
5135 (int8_t) b0, (int8_t) b1, (int8_t) b2, (int8_t) b3,
5136 (int8_t) b4, (int8_t) b5, (int8_t) b6, (int8_t) b7,
5137 (int8_t) b8, (int8_t) b9, (int8_t) b10, (int8_t) b11,
5138 (int8_t) b12, (int8_t) b13, (int8_t) b14, (int8_t) b15};
5139 return (__m128i) vld1q_s8(data);
5140}
5141
5142// Set packed double-precision (64-bit) floating-point elements in dst with the
5143// supplied values in reverse order.
5144// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setr_pd
5145FORCE_INLINE __m128d _mm_setr_pd(double e1, double e0)
5146{
5147 return _mm_set_pd(e0, e1);
5148}
5149
5150// Return vector of type __m128d with all elements set to zero.
5151// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setzero_pd
5152FORCE_INLINE __m128d _mm_setzero_pd(void)
5153{
5154#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
5155 return vreinterpretq_m128d_f64(vdupq_n_f64(0));
5156#else
5157 return vreinterpretq_m128d_f32(vdupq_n_f32(0));
5158#endif
5159}
5160
5161// Return vector of type __m128i with all elements set to zero.
5162// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setzero_si128
5163FORCE_INLINE __m128i _mm_setzero_si128(void)
5164{
5165 return vreinterpretq_m128i_s32(vdupq_n_s32(0));
5166}
5167
5168// Shuffle 32-bit integers in a using the control in imm8, and store the results
5169// in dst.
5170// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shuffle_epi32
5171// FORCE_INLINE __m128i _mm_shuffle_epi32(__m128i a,
5172// __constrange(0,255) int imm)
5173#if defined(_sse2neon_shuffle)
5174#define _mm_shuffle_epi32(a, imm) \
5175 __extension__({ \
5176 int32x4_t _input = vreinterpretq_s32_m128i(a); \
5177 int32x4_t _shuf = \
5178 vshuffleq_s32(_input, _input, (imm) & (0x3), ((imm) >> 2) & 0x3, \
5179 ((imm) >> 4) & 0x3, ((imm) >> 6) & 0x3); \
5180 vreinterpretq_m128i_s32(_shuf); \
5181 })
5182#else // generic
5183#define _mm_shuffle_epi32(a, imm) \
5184 _sse2neon_define1( \
5185 __m128i, a, __m128i ret; switch (imm) { \
5186 case _MM_SHUFFLE(1, 0, 3, 2): \
5187 ret = _mm_shuffle_epi_1032(_a); \
5188 break; \
5189 case _MM_SHUFFLE(2, 3, 0, 1): \
5190 ret = _mm_shuffle_epi_2301(_a); \
5191 break; \
5192 case _MM_SHUFFLE(0, 3, 2, 1): \
5193 ret = _mm_shuffle_epi_0321(_a); \
5194 break; \
5195 case _MM_SHUFFLE(2, 1, 0, 3): \
5196 ret = _mm_shuffle_epi_2103(_a); \
5197 break; \
5198 case _MM_SHUFFLE(1, 0, 1, 0): \
5199 ret = _mm_shuffle_epi_1010(_a); \
5200 break; \
5201 case _MM_SHUFFLE(1, 0, 0, 1): \
5202 ret = _mm_shuffle_epi_1001(_a); \
5203 break; \
5204 case _MM_SHUFFLE(0, 1, 0, 1): \
5205 ret = _mm_shuffle_epi_0101(_a); \
5206 break; \
5207 case _MM_SHUFFLE(2, 2, 1, 1): \
5208 ret = _mm_shuffle_epi_2211(_a); \
5209 break; \
5210 case _MM_SHUFFLE(0, 1, 2, 2): \
5211 ret = _mm_shuffle_epi_0122(_a); \
5212 break; \
5213 case _MM_SHUFFLE(3, 3, 3, 2): \
5214 ret = _mm_shuffle_epi_3332(_a); \
5215 break; \
5216 case _MM_SHUFFLE(0, 0, 0, 0): \
5217 ret = _mm_shuffle_epi32_splat(_a, 0); \
5218 break; \
5219 case _MM_SHUFFLE(1, 1, 1, 1): \
5220 ret = _mm_shuffle_epi32_splat(_a, 1); \
5221 break; \
5222 case _MM_SHUFFLE(2, 2, 2, 2): \
5223 ret = _mm_shuffle_epi32_splat(_a, 2); \
5224 break; \
5225 case _MM_SHUFFLE(3, 3, 3, 3): \
5226 ret = _mm_shuffle_epi32_splat(_a, 3); \
5227 break; \
5228 default: \
5229 ret = _mm_shuffle_epi32_default(_a, (imm)); \
5230 break; \
5231 } _sse2neon_return(ret);)
5232#endif
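// Illustrative usage (editor's sketch, not part of upstream sse2neon):
// _MM_SHUFFLE(e3, e2, e1, e0) packs four 2-bit source-lane selectors, where
// e0 picks destination element 0 and e3 picks destination element 3.
//   __m128i v = _mm_setr_epi32(10, 20, 30, 40);
//   __m128i r = _mm_shuffle_epi32(v, _MM_SHUFFLE(0, 1, 2, 3));
//   // r holds 40, 30, 20, 10 (element 0 listed first)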
5233
5234// Shuffle double-precision (64-bit) floating-point elements using the control
5235// in imm8, and store the results in dst.
5236// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shuffle_pd
5237#ifdef _sse2neon_shuffle
5238#define _mm_shuffle_pd(a, b, imm8) \
5239 vreinterpretq_m128d_s64( \
5240 vshuffleq_s64(vreinterpretq_s64_m128d(a), vreinterpretq_s64_m128d(b), \
5241 (imm8) & 0x1, (((imm8) & 0x2) >> 1) + 2))
5242#else
5243#define _mm_shuffle_pd(a, b, imm8) \
5244 _mm_castsi128_pd(_mm_set_epi64x( \
5245 vgetq_lane_s64(vreinterpretq_s64_m128d(b), ((imm8) & 0x2) >> 1), \
5246 vgetq_lane_s64(vreinterpretq_s64_m128d(a), (imm8) & 0x1)))
5247#endif
5248
5249// FORCE_INLINE __m128i _mm_shufflehi_epi16(__m128i a,
5250// __constrange(0,255) int imm)
5251#if defined(_sse2neon_shuffle)
5252#define _mm_shufflehi_epi16(a, imm) \
5253 __extension__({ \
5254 int16x8_t _input = vreinterpretq_s16_m128i(a); \
5255 int16x8_t _shuf = \
5256 vshuffleq_s16(_input, _input, 0, 1, 2, 3, ((imm) & (0x3)) + 4, \
5257 (((imm) >> 2) & 0x3) + 4, (((imm) >> 4) & 0x3) + 4, \
5258 (((imm) >> 6) & 0x3) + 4); \
5259 vreinterpretq_m128i_s16(_shuf); \
5260 })
5261#else // generic
5262#define _mm_shufflehi_epi16(a, imm) _mm_shufflehi_epi16_function((a), (imm))
5263#endif
5264
5265// FORCE_INLINE __m128i _mm_shufflelo_epi16(__m128i a,
5266// __constrange(0,255) int imm)
5267#if defined(_sse2neon_shuffle)
5268#define _mm_shufflelo_epi16(a, imm) \
5269 __extension__({ \
5270 int16x8_t _input = vreinterpretq_s16_m128i(a); \
5271 int16x8_t _shuf = vshuffleq_s16( \
5272 _input, _input, ((imm) & (0x3)), (((imm) >> 2) & 0x3), \
5273 (((imm) >> 4) & 0x3), (((imm) >> 6) & 0x3), 4, 5, 6, 7); \
5274 vreinterpretq_m128i_s16(_shuf); \
5275 })
5276#else // generic
5277#define _mm_shufflelo_epi16(a, imm) _mm_shufflelo_epi16_function((a), (imm))
5278#endif
5279
5280// Shift packed 16-bit integers in a left by count while shifting in zeros, and
5281// store the results in dst.
5282// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sll_epi16
5283FORCE_INLINE __m128i _mm_sll_epi16(__m128i a, __m128i count)
5284{
5285 uint64_t c = vreinterpretq_nth_u64_m128i(count, 0);
5286 if (_sse2neon_unlikely(c & ~15))
5287 return _mm_setzero_si128();
5288
5289 int16x8_t vc = vdupq_n_s16((int16_t) c);
5290 return vreinterpretq_m128i_s16(vshlq_s16(vreinterpretq_s16_m128i(a), vc));
5291}
5292
5293// Shift packed 32-bit integers in a left by count while shifting in zeros, and
5294// store the results in dst.
5295// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sll_epi32
5296FORCE_INLINE __m128i _mm_sll_epi32(__m128i a, __m128i count)
5297{
5298 uint64_t c = vreinterpretq_nth_u64_m128i(count, 0);
5299 if (_sse2neon_unlikely(c & ~31))
5300 return _mm_setzero_si128();
5301
5302 int32x4_t vc = vdupq_n_s32((int32_t) c);
5303 return vreinterpretq_m128i_s32(vshlq_s32(vreinterpretq_s32_m128i(a), vc));
5304}
5305
5306// Shift packed 64-bit integers in a left by count while shifting in zeros, and
5307// store the results in dst.
5308// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sll_epi64
5309FORCE_INLINE __m128i _mm_sll_epi64(__m128i a, __m128i count)
5310{
5311 uint64_t c = vreinterpretq_nth_u64_m128i(count, 0);
5312 if (_sse2neon_unlikely(c & ~63))
5313 return _mm_setzero_si128();
5314
5315 int64x2_t vc = vdupq_n_s64((int64_t) c);
5316 return vreinterpretq_m128i_s64(vshlq_s64(vreinterpretq_s64_m128i(a), vc));
5317}
5318
5319// Shift packed 16-bit integers in a left by imm8 while shifting in zeros, and
5320// store the results in dst.
5321// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_slli_epi16
5322FORCE_INLINE __m128i _mm_slli_epi16(__m128i a, int imm)
5323{
5324 if (_sse2neon_unlikely(imm & ~15))
5325 return _mm_setzero_si128();
5326 return vreinterpretq_m128i_s16(
5327 vshlq_s16(vreinterpretq_s16_m128i(a), vdupq_n_s16((int16_t) imm)));
5328}
5329
5330// Shift packed 32-bit integers in a left by imm8 while shifting in zeros, and
5331// store the results in dst.
5332// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_slli_epi32
5333FORCE_INLINE __m128i _mm_slli_epi32(__m128i a, int imm)
5334{
5335 if (_sse2neon_unlikely(imm & ~31))
5336 return _mm_setzero_si128();
5337 return vreinterpretq_m128i_s32(
5338 vshlq_s32(vreinterpretq_s32_m128i(a), vdupq_n_s32(imm)));
5339}
5340
5341// Shift packed 64-bit integers in a left by imm8 while shifting in zeros, and
5342// store the results in dst.
5343// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_slli_epi64
5344FORCE_INLINE __m128i _mm_slli_epi64(__m128i a, int imm)
5345{
5346 if (_sse2neon_unlikely(imm & ~63))
5347 return _mm_setzero_si128();
5348 return vreinterpretq_m128i_s64(
5349 vshlq_s64(vreinterpretq_s64_m128i(a), vdupq_n_s64(imm)));
5350}
5351
5352// Shift a left by imm8 bytes while shifting in zeros, and store the results in
5353// dst.
5354// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_slli_si128
5355#define _mm_slli_si128(a, imm) \
5356 _sse2neon_define1( \
5357 __m128i, a, int8x16_t ret; \
5358 if (_sse2neon_unlikely((imm) == 0)) ret = vreinterpretq_s8_m128i(_a); \
5359 else if (_sse2neon_unlikely((imm) & ~15)) ret = vdupq_n_s8(0); \
5360 else ret = vextq_s8(vdupq_n_s8(0), vreinterpretq_s8_m128i(_a), \
5361 (((imm) <= 0 || (imm) > 15) ? 0 : (16 - (imm)))); \
5362 _sse2neon_return(vreinterpretq_m128i_s8(ret));)
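// Illustrative usage (editor's sketch, not part of upstream sse2neon):
// shifting left by n bytes moves each byte to a position n higher and fills
// the vacated low bytes with zero.
//   __m128i v = _mm_setr_epi8(1, 2, 3, 4, 5, 6, 7, 8,
//                             9, 10, 11, 12, 13, 14, 15, 16);
//   __m128i r = _mm_slli_si128(v, 4); // bytes: 0, 0, 0, 0, 1, 2, 3, ..., 12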
5363
5364// Compute the square root of packed double-precision (64-bit) floating-point
5365// elements in a, and store the results in dst.
5366// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sqrt_pd
5367FORCE_INLINE __m128d _mm_sqrt_pd(__m128d a)
5368{
5369#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
5370 return vreinterpretq_m128d_f64(vsqrtq_f64(vreinterpretq_f64_m128d(a)));
5371#else
5372 double a0, a1;
5373 a0 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0));
5374 a1 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1));
5375 double _a0 = sqrt(a0);
5376 double _a1 = sqrt(a1);
5377 return _mm_set_pd(_a1, _a0);
5378#endif
5379}
5380
5381// Compute the square root of the lower double-precision (64-bit) floating-point
5382// element in b, store the result in the lower element of dst, and copy the
5383// upper element from a to the upper element of dst.
5384// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sqrt_sd
5385FORCE_INLINE __m128d _mm_sqrt_sd(__m128d a, __m128d b)
5386{
5387#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
5388 return _mm_move_sd(a, _mm_sqrt_pd(b));
5389#else
5390 double _a, _b;
5391 _a = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1));
5392 _b = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0));
5393 return _mm_set_pd(_a, sqrt(_b));
5394#endif
5395}
5396
5397// Shift packed 16-bit integers in a right by count while shifting in sign bits,
5398// and store the results in dst.
5399// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sra_epi16
5400FORCE_INLINE __m128i _mm_sra_epi16(__m128i a, __m128i count)
5401{
5402 int64_t c = vgetq_lane_s64(count, 0);
5403 if (_sse2neon_unlikely(c & ~15))
5404 return _mm_cmplt_epi16(a, _mm_setzero_si128());
5405 return vreinterpretq_m128i_s16(
5406 vshlq_s16((int16x8_t) a, vdupq_n_s16((int16_t) -c)));
5407}
5408
5409// Shift packed 32-bit integers in a right by count while shifting in sign bits,
5410// and store the results in dst.
5411// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sra_epi32
5412FORCE_INLINE __m128i _mm_sra_epi32(__m128i a, __m128i count)
5413{
5414 int64_t c = vgetq_lane_s64(count, 0);
5415 if (_sse2neon_unlikely(c & ~31))
5416 return _mm_cmplt_epi32(a, _mm_setzero_si128());
5417 return vreinterpretq_m128i_s32(
5418 vshlq_s32((int32x4_t) a, vdupq_n_s32((int) -c)));
5419}
5420
5421// Shift packed 16-bit integers in a right by imm8 while shifting in sign
5422// bits, and store the results in dst.
5423// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srai_epi16
5424FORCE_INLINE __m128i _mm_srai_epi16(__m128i a, int imm)
5425{
5426 const int16_t count = (imm & ~15) ? 15 : (int16_t) imm;
5427 return (__m128i) vshlq_s16((int16x8_t) a, vdupq_n_s16(-count));
5428}
5429
5430// Shift packed 32-bit integers in a right by imm8 while shifting in sign bits,
5431// and store the results in dst.
5432// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srai_epi32
5433// FORCE_INLINE __m128i _mm_srai_epi32(__m128i a, __constrange(0,255) int imm)
5434#define _mm_srai_epi32(a, imm) \
5435 _sse2neon_define0( \
5436 __m128i, a, __m128i ret; if (_sse2neon_unlikely((imm) == 0)) { \
5437 ret = _a; \
5438 } else if (_sse2neon_likely(0 < (imm) && (imm) < 32)) { \
5439 ret = vreinterpretq_m128i_s32( \
5440 vshlq_s32(vreinterpretq_s32_m128i(_a), vdupq_n_s32(-(imm)))); \
5441 } else { \
5442 ret = vreinterpretq_m128i_s32( \
5443 vshrq_n_s32(vreinterpretq_s32_m128i(_a), 31)); \
5444 } _sse2neon_return(ret);)
5445
5446// Shift packed 16-bit integers in a right by count while shifting in zeros, and
5447// store the results in dst.
5448// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srl_epi16
5449FORCE_INLINE __m128i _mm_srl_epi16(__m128i a, __m128i count)
5450{
5451 uint64_t c = vreinterpretq_nth_u64_m128i(count, 0);
5452 if (_sse2neon_unlikely(c & ~15))
5453 return _mm_setzero_si128();
5454
5455 int16x8_t vc = vdupq_n_s16(-(int16_t) c);
5456 return vreinterpretq_m128i_u16(vshlq_u16(vreinterpretq_u16_m128i(a), vc));
5457}
5458
5459// Shift packed 32-bit integers in a right by count while shifting in zeros, and
5460// store the results in dst.
5461// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srl_epi32
5462FORCE_INLINE __m128i _mm_srl_epi32(__m128i a, __m128i count)
5463{
5464 uint64_t c = vreinterpretq_nth_u64_m128i(count, 0);
5465 if (_sse2neon_unlikely(c & ~31))
5466 return _mm_setzero_si128();
5467
5468 int32x4_t vc = vdupq_n_s32(-(int32_t) c);
5469 return vreinterpretq_m128i_u32(vshlq_u32(vreinterpretq_u32_m128i(a), vc));
5470}
5471
5472// Shift packed 64-bit integers in a right by count while shifting in zeros, and
5473// store the results in dst.
5474// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srl_epi64
5475FORCE_INLINE __m128i _mm_srl_epi64(__m128i a, __m128i count)
5476{
5477 uint64_t c = vreinterpretq_nth_u64_m128i(count, 0);
5478 if (_sse2neon_unlikely(c & ~63))
5479 return _mm_setzero_si128();
5480
5481 int64x2_t vc = vdupq_n_s64(-(int64_t) c);
5482 return vreinterpretq_m128i_u64(vshlq_u64(vreinterpretq_u64_m128i(a), vc));
5483}
5484
5485// Shift packed 16-bit integers in a right by imm8 while shifting in zeros, and
5486// store the results in dst.
5487// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srli_epi16
5488#define _mm_srli_epi16(a, imm) \
5489 _sse2neon_define0( \
5490 __m128i, a, __m128i ret; if (_sse2neon_unlikely((imm) & ~15)) { \
5491 ret = _mm_setzero_si128(); \
5492 } else { \
5493 ret = vreinterpretq_m128i_u16(vshlq_u16( \
5494 vreinterpretq_u16_m128i(_a), vdupq_n_s16((int16_t) - (imm)))); \
5495 } _sse2neon_return(ret);)
5496
5497// Shift packed 32-bit integers in a right by imm8 while shifting in zeros, and
5498// store the results in dst.
5499// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srli_epi32
5500// FORCE_INLINE __m128i _mm_srli_epi32(__m128i a, __constrange(0,255) int imm)
5501#define _mm_srli_epi32(a, imm) \
5502 _sse2neon_define0( \
5503 __m128i, a, __m128i ret; if (_sse2neon_unlikely((imm) & ~31)) { \
5504 ret = _mm_setzero_si128(); \
5505 } else { \
5506 ret = vreinterpretq_m128i_u32( \
5507 vshlq_u32(vreinterpretq_u32_m128i(_a), vdupq_n_s32(-(imm)))); \
5508 } _sse2neon_return(ret);)
5509
5510// Shift packed 64-bit integers in a right by imm8 while shifting in zeros, and
5511// store the results in dst.
5512// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srli_epi64
5513#define _mm_srli_epi64(a, imm) \
5514 _sse2neon_define0( \
5515 __m128i, a, __m128i ret; if (_sse2neon_unlikely((imm) & ~63)) { \
5516 ret = _mm_setzero_si128(); \
5517 } else { \
5518 ret = vreinterpretq_m128i_u64( \
5519 vshlq_u64(vreinterpretq_u64_m128i(_a), vdupq_n_s64(-(imm)))); \
5520 } _sse2neon_return(ret);)
5521
5522// Shift a right by imm8 bytes while shifting in zeros, and store the results in
5523// dst.
5524// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srli_si128
5525#define _mm_srli_si128(a, imm) \
5526 _sse2neon_define1( \
5527 __m128i, a, int8x16_t ret; \
5528 if (_sse2neon_unlikely((imm) & ~15)) ret = vdupq_n_s8(0); \
5529 else ret = vextq_s8(vreinterpretq_s8_m128i(_a), vdupq_n_s8(0), \
5530 ((imm) > 15 ? 0 : (imm))); \
5531 _sse2neon_return(vreinterpretq_m128i_s8(ret));)
5532
5533// Store 128-bits (composed of 2 packed double-precision (64-bit) floating-point
5534// elements) from a into memory. mem_addr must be aligned on a 16-byte boundary
5535// or a general-protection exception may be generated.
5536// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_store_pd
5537FORCE_INLINE void _mm_store_pd(double *mem_addr, __m128d a)
5538{
5539#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
5540 vst1q_f64((float64_t *) mem_addr, vreinterpretq_f64_m128d(a));
5541#else
5542 vst1q_f32((float32_t *) mem_addr, vreinterpretq_f32_m128d(a));
5543#endif
5544}
5545
5546// Store the lower double-precision (64-bit) floating-point element from a into
5547// 2 contiguous elements in memory. mem_addr must be aligned on a 16-byte
5548// boundary or a general-protection exception may be generated.
5549// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_store_pd1
5550FORCE_INLINE void _mm_store_pd1(double *mem_addr, __m128d a)
5551{
5552#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
5553 float64x1_t a_low = vget_low_f64(vreinterpretq_f64_m128d(a));
5554 vst1q_f64((float64_t *) mem_addr,
5555 vreinterpretq_f64_m128d(vcombine_f64(a_low, a_low)));
5556#else
5557 float32x2_t a_low = vget_low_f32(vreinterpretq_f32_m128d(a));
5558 vst1q_f32((float32_t *) mem_addr,
5559 vreinterpretq_f32_m128d(vcombine_f32(a_low, a_low)));
5560#endif
5561}
5562
5563// Store the lower double-precision (64-bit) floating-point element from a into
5564// memory. mem_addr does not need to be aligned on any particular boundary.
5565// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_store_sd
5566FORCE_INLINE void _mm_store_sd(double *mem_addr, __m128d a)
5567{
5568#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
5569 vst1_f64((float64_t *) mem_addr, vget_low_f64(vreinterpretq_f64_m128d(a)));
5570#else
5571 vst1_u64((uint64_t *) mem_addr, vget_low_u64(vreinterpretq_u64_m128d(a)));
5572#endif
5573}
5574
5575// Store 128-bits of integer data from a into memory. mem_addr must be aligned
5576// on a 16-byte boundary or a general-protection exception may be generated.
5577// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_store_si128
5578FORCE_INLINE void _mm_store_si128(__m128i *p, __m128i a)
5579{
5580 vst1q_s32((int32_t *) p, vreinterpretq_s32_m128i(a));
5581}
5582
5583// Store the lower double-precision (64-bit) floating-point element from a into
5584// 2 contiguous elements in memory. mem_addr must be aligned on a 16-byte
5585// boundary or a general-protection exception may be generated.
5586// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#expand=9,526,5601&text=_mm_store1_pd
5587#define _mm_store1_pd _mm_store_pd1
5588
5589// Store the upper double-precision (64-bit) floating-point element from a into
5590// memory.
5591// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storeh_pd
5592FORCE_INLINE void _mm_storeh_pd(double *mem_addr, __m128d a)
5593{
5594#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
5595 vst1_f64((float64_t *) mem_addr, vget_high_f64(vreinterpretq_f64_m128d(a)));
5596#else
5597 vst1_f32((float32_t *) mem_addr, vget_high_f32(vreinterpretq_f32_m128d(a)));
5598#endif
5599}
5600
5601// Store 64-bit integer from the first element of a into memory.
5602// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storel_epi64
5603FORCE_INLINE void _mm_storel_epi64(__m128i *a, __m128i b)
5604{
5605 vst1_u64((uint64_t *) a, vget_low_u64(vreinterpretq_u64_m128i(b)));
5606}
5607
5608// Store the lower double-precision (64-bit) floating-point element from a into
5609// memory.
5610// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storel_pd
5611FORCE_INLINE void _mm_storel_pd(double *mem_addr, __m128d a)
5612{
5613#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
5614 vst1_f64((float64_t *) mem_addr, vget_low_f64(vreinterpretq_f64_m128d(a)));
5615#else
5616 vst1_f32((float32_t *) mem_addr, vget_low_f32(vreinterpretq_f32_m128d(a)));
5617#endif
5618}
5619
5620// Store 2 double-precision (64-bit) floating-point elements from a into memory
5621// in reverse order. mem_addr must be aligned on a 16-byte boundary or a
5622// general-protection exception may be generated.
5623// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storer_pd
5624FORCE_INLINE void _mm_storer_pd(double *mem_addr, __m128d a)
5625{
5626 float32x4_t f = vreinterpretq_f32_m128d(a);
5627 _mm_store_pd(mem_addr, vreinterpretq_m128d_f32(vextq_f32(f, f, 2)));
5628}
5629
5630// Store 128-bits (composed of 2 packed double-precision (64-bit) floating-point
5631// elements) from a into memory. mem_addr does not need to be aligned on any
5632// particular boundary.
5633// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storeu_pd
5634FORCE_INLINE void _mm_storeu_pd(double *mem_addr, __m128d a)
5635{
5636 _mm_store_pd(mem_addr, a);
5637}
5638
5639// Store 128-bits of integer data from a into memory. mem_addr does not need to
5640// be aligned on any particular boundary.
5641// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storeu_si128
5642FORCE_INLINE void _mm_storeu_si128(__m128i *p, __m128i a)
5643{
5644 vst1q_s32((int32_t *) p, vreinterpretq_s32_m128i(a));
5645}
5646
5647// Store 32-bit integer from the first element of a into memory. mem_addr does
5648// not need to be aligned on any particular boundary.
5649// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storeu_si32
5650FORCE_INLINE void _mm_storeu_si32(void *p, __m128i a)
5651{
5652 vst1q_lane_s32((int32_t *) p, vreinterpretq_s32_m128i(a), 0);
5653}
5654
5655// Store 128-bits (composed of 2 packed double-precision (64-bit) floating-point
5656// elements) from a into memory using a non-temporal memory hint. mem_addr must
5657// be aligned on a 16-byte boundary or a general-protection exception may be
5658// generated.
5659// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_stream_pd
5660FORCE_INLINE void _mm_stream_pd(double *p, __m128d a)
5661{
5662#if __has_builtin(__builtin_nontemporal_store)
5663 __builtin_nontemporal_store(a, (__m128d *) p);
5664#elif defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
5665 vst1q_f64(p, vreinterpretq_f64_m128d(a));
5666#else
5667 vst1q_s64((int64_t *) p, vreinterpretq_s64_m128d(a));
5668#endif
5669}
5670
5671// Store 128-bits of integer data from a into memory using a non-temporal memory
5672// hint. mem_addr must be aligned on a 16-byte boundary or a general-protection
5673// exception may be generated.
5674// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_stream_si128
5675FORCE_INLINE void _mm_stream_si128(__m128i *p, __m128i a)
5676{
5677#if __has_builtin(__builtin_nontemporal_store)
5678 __builtin_nontemporal_store(a, p);
5679#else
5680 vst1q_s64((int64_t *) p, vreinterpretq_s64_m128i(a));
5681#endif
5682}
5683
5684// Store 32-bit integer a into memory using a non-temporal hint to minimize
5685// cache pollution. If the cache line containing address mem_addr is already in
5686// the cache, the cache will be updated.
5687// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_stream_si32
5688FORCE_INLINE void _mm_stream_si32(int *p, int a)
5689{
5690 vst1q_lane_s32((int32_t *) p, vdupq_n_s32(a), 0);
5691}
5692
5693// Store 64-bit integer a into memory using a non-temporal hint to minimize
5694// cache pollution. If the cache line containing address mem_addr is already in
5695// the cache, the cache will be updated.
5696// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_stream_si64
5697FORCE_INLINE void _mm_stream_si64(__int64 *p, __int64 a)
5698{
5699 vst1_s64((int64_t *) p, vdup_n_s64((int64_t) a));
5700}
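
/* Editor's usage sketch (not part of sse2neon.h): Arm has no direct
 * non-temporal store hint, so the _mm_stream_* helpers above fall back to
 * ordinary NEON stores (or __builtin_nontemporal_store when available) and
 * the cache-bypass behaviour is best-effort only.  Code written against the
 * x86 semantics still works unchanged; sse2neon_stream_fill and its
 * parameters are hypothetical names used only for this illustration. */
static inline void sse2neon_stream_fill(int32_t *dst /* 16-byte aligned */,
                                        int32_t value,
                                        int count /* multiple of 4 */)
{
    __m128i v = _mm_set1_epi32(value);
    for (int i = 0; i < count; i += 4)
        _mm_stream_si128((__m128i *) (dst + i), v); /* plain vst1q on NEON */
}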
5701
5702// Subtract packed 16-bit integers in b from packed 16-bit integers in a, and
5703// store the results in dst.
5704// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sub_epi16
5705FORCE_INLINE __m128i _mm_sub_epi16(__m128i a, __m128i b)
5706{
5707 return vreinterpretq_m128i_s16(
5708 vsubq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
5709}
5710
5711// Subtract packed 32-bit integers in b from packed 32-bit integers in a, and
5712// store the results in dst.
5713// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sub_epi32
5714FORCE_INLINE __m128i _mm_sub_epi32(__m128i a, __m128i b)
5715{
5716 return vreinterpretq_m128i_s32(
5717 vsubq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
5718}
5719
5720// Subtract packed 64-bit integers in b from packed 64-bit integers in a, and
5721// store the results in dst.
5722// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sub_epi64
5723FORCE_INLINE __m128i _mm_sub_epi64(__m128i a, __m128i b)
5724{
5725 return vreinterpretq_m128i_s64(
5726 vsubq_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(b)));
5727}
5728
5729// Subtract packed 8-bit integers in b from packed 8-bit integers in a, and
5730// store the results in dst.
5731// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sub_epi8
5732FORCE_INLINE __m128i _mm_sub_epi8(__m128i a, __m128i b)
5733{
5734 return vreinterpretq_m128i_s8(
5735 vsubq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
5736}
5737
5738// Subtract packed double-precision (64-bit) floating-point elements in b from
5739// packed double-precision (64-bit) floating-point elements in a, and store the
5740// results in dst.
5741// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sub_pd
5742FORCE_INLINE __m128d _mm_sub_pd(__m128d a, __m128d b)
5743{
5744#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
5745 return vreinterpretq_m128d_f64(
5746 vsubq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
5747#else
5748 double a0 =
5749 sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0));
5750 double a1 =
5751 sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1));
5752 double b0 =
5753 sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0));
5754 double b1 =
5755 sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 1));
5756 double c[2];
5757 c[0] = a0 - b0;
5758 c[1] = a1 - b1;
5759 return vld1q_f32((float32_t *) c);
5760#endif
5761}
5762
5763// Subtract the lower double-precision (64-bit) floating-point element in b from
5764// the lower double-precision (64-bit) floating-point element in a, store the
5765// result in the lower element of dst, and copy the upper element from a to the
5766// upper element of dst.
5767// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sub_sd
5768FORCE_INLINE __m128d _mm_sub_sd(__m128d a, __m128d b)
5769{
5770 return _mm_move_sd(a, _mm_sub_pd(a, b));
5771}
5772
5773// Subtract 64-bit integer b from 64-bit integer a, and store the result in dst.
5774// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sub_si64
5775FORCE_INLINE __m64 _mm_sub_si64(__m64 a, __m64 b)
5776{
5777 return vreinterpret_m64_s64(
5778 vsub_s64(vreinterpret_s64_m64(a), vreinterpret_s64_m64(b)));
5779}
5780
5781// Subtract packed signed 16-bit integers in b from packed 16-bit integers in a
5782// using saturation, and store the results in dst.
5783// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_subs_epi16
5784FORCE_INLINE __m128i _mm_subs_epi16(__m128i a, __m128i b)
5785{
5786 return vreinterpretq_m128i_s16(
5787 vqsubq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
5788}
5789
5790// Subtract packed signed 8-bit integers in b from packed 8-bit integers in a
5791// using saturation, and store the results in dst.
5792// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_subs_epi8
5793FORCE_INLINE __m128i _mm_subs_epi8(__m128i a, __m128i b)
5794{
5795 return vreinterpretq_m128i_s8(
5796 vqsubq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
5797}
5798
5799// Subtract packed unsigned 16-bit integers in b from packed unsigned 16-bit
5800// integers in a using saturation, and store the results in dst.
5801// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_subs_epu16
5802FORCE_INLINE __m128i _mm_subs_epu16(__m128i a, __m128i b)
5803{
5804 return vreinterpretq_m128i_u16(
5805 vqsubq_u16(vreinterpretq_u16_m128i(a), vreinterpretq_u16_m128i(b)));
5806}
5807
5808// Subtract packed unsigned 8-bit integers in b from packed unsigned 8-bit
5809// integers in a using saturation, and store the results in dst.
5810// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_subs_epu8
5811FORCE_INLINE __m128i _mm_subs_epu8(__m128i a, __m128i b)
5812{
5813 return vreinterpretq_m128i_u8(
5814 vqsubq_u8(vreinterpretq_u8_m128i(a), vreinterpretq_u8_m128i(b)));
5815}
5816
5817#define _mm_ucomieq_sd _mm_comieq_sd
5818#define _mm_ucomige_sd _mm_comige_sd
5819#define _mm_ucomigt_sd _mm_comigt_sd
5820#define _mm_ucomile_sd _mm_comile_sd
5821#define _mm_ucomilt_sd _mm_comilt_sd
5822#define _mm_ucomineq_sd _mm_comineq_sd
5823
5824// Return vector of type __m128d with undefined elements.
5825// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_undefined_pd
5826FORCE_INLINE __m128d _mm_undefined_pd(void)
5827{
5828#if defined(__GNUC__) || defined(__clang__)
5829#pragma GCC diagnostic push
5830#pragma GCC diagnostic ignored "-Wuninitialized"
5831#endif
5832 __m128d a;
5833#if defined(_MSC_VER) && !defined(__clang__)
5834 a = _mm_setzero_pd();
5835#endif
5836 return a;
5837#if defined(__GNUC__) || defined(__clang__)
5838#pragma GCC diagnostic pop
5839#endif
5840}
5841
5842// Unpack and interleave 16-bit integers from the high half of a and b, and
5843// store the results in dst.
5844// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpackhi_epi16
5845FORCE_INLINE __m128i _mm_unpackhi_epi16(__m128i a, __m128i b)
5846{
5847#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
5848 return vreinterpretq_m128i_s16(
5849 vzip2q_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
5850#else
5851 int16x4_t a1 = vget_high_s16(vreinterpretq_s16_m128i(a));
5852 int16x4_t b1 = vget_high_s16(vreinterpretq_s16_m128i(b));
5853 int16x4x2_t result = vzip_s16(a1, b1);
5854 return vreinterpretq_m128i_s16(vcombine_s16(result.val[0], result.val[1]));
5855#endif
5856}
5857
5858// Unpack and interleave 32-bit integers from the high half of a and b, and
5859// store the results in dst.
5860// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpackhi_epi32
5861FORCE_INLINE __m128i _mm_unpackhi_epi32(__m128i a, __m128i b)
5862{
5863#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
5864 return vreinterpretq_m128i_s32(
5865 vzip2q_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
5866#else
5867 int32x2_t a1 = vget_high_s32(vreinterpretq_s32_m128i(a));
5868 int32x2_t b1 = vget_high_s32(vreinterpretq_s32_m128i(b));
5869 int32x2x2_t result = vzip_s32(a1, b1);
5870 return vreinterpretq_m128i_s32(vcombine_s32(result.val[0], result.val[1]));
5871#endif
5872}
5873
5874// Unpack and interleave 64-bit integers from the high half of a and b, and
5875// store the results in dst.
5876// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpackhi_epi64
5877FORCE_INLINE __m128i _mm_unpackhi_epi64(__m128i a, __m128i b)
5878{
5879#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
5880 return vreinterpretq_m128i_s64(
5881 vzip2q_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(b)));
5882#else
5883 int64x1_t a_h = vget_high_s64(vreinterpretq_s64_m128i(a));
5884 int64x1_t b_h = vget_high_s64(vreinterpretq_s64_m128i(b));
5885 return vreinterpretq_m128i_s64(vcombine_s64(a_h, b_h));
5886#endif
5887}
5888
5889// Unpack and interleave 8-bit integers from the high half of a and b, and store
5890// the results in dst.
5891// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpackhi_epi8
5892FORCE_INLINE __m128i _mm_unpackhi_epi8(__m128i a, __m128i b)
5893{
5894#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
5895 return vreinterpretq_m128i_s8(
5896 vzip2q_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
5897#else
5898 int8x8_t a1 =
5899 vreinterpret_s8_s16(vget_high_s16(vreinterpretq_s16_m128i(a)));
5900 int8x8_t b1 =
5901 vreinterpret_s8_s16(vget_high_s16(vreinterpretq_s16_m128i(b)));
5902 int8x8x2_t result = vzip_s8(a1, b1);
5903 return vreinterpretq_m128i_s8(vcombine_s8(result.val[0], result.val[1]));
5904#endif
5905}
5906
5907// Unpack and interleave double-precision (64-bit) floating-point elements from
5908// the high half of a and b, and store the results in dst.
5909// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpackhi_pd
5910FORCE_INLINE __m128d _mm_unpackhi_pd(__m128d a, __m128d b)
5911{
5912#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
5913 return vreinterpretq_m128d_f64(
5914 vzip2q_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
5915#else
5916 return vreinterpretq_m128d_s64(
5917 vcombine_s64(vget_high_s64(vreinterpretq_s64_m128d(a)),
5918 vget_high_s64(vreinterpretq_s64_m128d(b))));
5919#endif
5920}
5921
5922// Unpack and interleave 16-bit integers from the low half of a and b, and store
5923// the results in dst.
5924// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpacklo_epi16
5925FORCE_INLINE __m128i _mm_unpacklo_epi16(__m128i a, __m128i b)
5926{
5927#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
5928 return vreinterpretq_m128i_s16(
5929 vzip1q_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
5930#else
5931 int16x4_t a1 = vget_low_s16(vreinterpretq_s16_m128i(a));
5932 int16x4_t b1 = vget_low_s16(vreinterpretq_s16_m128i(b));
5933 int16x4x2_t result = vzip_s16(a1, b1);
5934 return vreinterpretq_m128i_s16(vcombine_s16(result.val[0], result.val[1]));
5935#endif
5936}
5937
5938// Unpack and interleave 32-bit integers from the low half of a and b, and store
5939// the results in dst.
5940// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpacklo_epi32
5941FORCE_INLINE __m128i _mm_unpacklo_epi32(__m128i a, __m128i b)
5942{
5943#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
5944 return vreinterpretq_m128i_s32(
5945 vzip1q_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
5946#else
5947 int32x2_t a1 = vget_low_s32(vreinterpretq_s32_m128i(a));
5948 int32x2_t b1 = vget_low_s32(vreinterpretq_s32_m128i(b));
5949 int32x2x2_t result = vzip_s32(a1, b1);
5950 return vreinterpretq_m128i_s32(vcombine_s32(result.val[0], result.val[1]));
5951#endif
5952}
5953
5954// Unpack and interleave 64-bit integers from the low half of a and b, and store
5955// the results in dst.
5956// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpacklo_epi64
5957FORCE_INLINE __m128i _mm_unpacklo_epi64(__m128i a, __m128i b)
5958{
5959#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
5960 return vreinterpretq_m128i_s64(
5961 vzip1q_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(b)));
5962#else
5963 int64x1_t a_l = vget_low_s64(vreinterpretq_s64_m128i(a));
5964 int64x1_t b_l = vget_low_s64(vreinterpretq_s64_m128i(b));
5965 return vreinterpretq_m128i_s64(vcombine_s64(a_l, b_l));
5966#endif
5967}
5968
5969// Unpack and interleave 8-bit integers from the low half of a and b, and store
5970// the results in dst.
5971// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpacklo_epi8
5972FORCE_INLINE __m128i _mm_unpacklo_epi8(__m128i a, __m128i b)
5973{
5974#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
5975 return vreinterpretq_m128i_s8(
5976 vzip1q_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
5977#else
5978 int8x8_t a1 = vreinterpret_s8_s16(vget_low_s16(vreinterpretq_s16_m128i(a)));
5979 int8x8_t b1 = vreinterpret_s8_s16(vget_low_s16(vreinterpretq_s16_m128i(b)));
5980 int8x8x2_t result = vzip_s8(a1, b1);
5981 return vreinterpretq_m128i_s8(vcombine_s8(result.val[0], result.val[1]));
5982#endif
5983}
5984
5985// Unpack and interleave double-precision (64-bit) floating-point elements from
5986// the low half of a and b, and store the results in dst.
5987// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpacklo_pd
5988FORCE_INLINE __m128d _mm_unpacklo_pd(__m128d a, __m128d b)
5989{
5990#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
5991 return vreinterpretq_m128d_f64(
5992 vzip1q_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
5993#else
5994 return vreinterpretq_m128d_s64(
5995 vcombine_s64(vget_low_s64(vreinterpretq_s64_m128d(a)),
5996 vget_low_s64(vreinterpretq_s64_m128d(b))));
5997#endif
5998}
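
/* Worked example (editor's sketch, not part of sse2neon.h): the unpack
 * intrinsics above are NEON zips.  With a = {0,1,2,3} and b = {10,11,12,13}
 * as 32-bit lanes, _mm_unpacklo_epi32 gives {0,10,1,11} (vzip1q_s32) and
 * _mm_unpackhi_epi32 gives {2,12,3,13} (vzip2q_s32). */
static inline void sse2neon_unpack_demo(__m128i *lo, __m128i *hi)
{
    __m128i a = _mm_setr_epi32(0, 1, 2, 3);
    __m128i b = _mm_setr_epi32(10, 11, 12, 13);
    *lo = _mm_unpacklo_epi32(a, b); /* 0,10,1,11 */
    *hi = _mm_unpackhi_epi32(a, b); /* 2,12,3,13 */
}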
5999
6000// Compute the bitwise XOR of packed double-precision (64-bit) floating-point
6001// elements in a and b, and store the results in dst.
6002// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_xor_pd
6003FORCE_INLINE __m128d _mm_xor_pd(__m128d a, __m128d b)
6004{
6005 return vreinterpretq_m128d_s64(
6006 veorq_s64(vreinterpretq_s64_m128d(a), vreinterpretq_s64_m128d(b)));
6007}
6008
6009// Compute the bitwise XOR of 128 bits (representing integer data) in a and b,
6010// and store the result in dst.
6011// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_xor_si128
6012FORCE_INLINE __m128i _mm_xor_si128(__m128i a, __m128i b)
6013{
6014 return vreinterpretq_m128i_s32(
6015 veorq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
6016}
6017
6018/* SSE3 */
6019
6020// Alternatively add and subtract packed double-precision (64-bit)
6021// floating-point elements in a to/from packed elements in b, and store the
6022// results in dst.
6023// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_addsub_pd
6024FORCE_INLINE __m128d _mm_addsub_pd(__m128d a, __m128d b)
6025{
6026 _sse2neon_const __m128d mask = _mm_set_pd(1.0f, -1.0f);
6027#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
6028 return vreinterpretq_m128d_f64(vfmaq_f64(vreinterpretq_f64_m128d(a),
6029 vreinterpretq_f64_m128d(b),
6030 vreinterpretq_f64_m128d(mask)));
6031#else
6032 return _mm_add_pd(_mm_mul_pd(b, mask), a);
6033#endif
6034}
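
/* Editor's sketch (not part of sse2neon.h): _mm_addsub_pd above computes
 * dst = { a0 - b0, a1 + b1 } by folding the constant mask {-1.0, 1.0} into a
 * fused multiply-add (a + b * mask) on AArch64.  A scalar model of the same
 * result, for reference: */
static inline void sse2neon_addsub_pd_model(const double a[2],
                                            const double b[2],
                                            double out[2])
{
    out[0] = a[0] - b[0]; /* even lane: subtract */
    out[1] = a[1] + b[1]; /* odd lane: add */
}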
6035
6036// Alternatively add and subtract packed single-precision (32-bit)
6037// floating-point elements in a to/from packed elements in b, and store the
6038// results in dst.
6039// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_addsub_ps
6040FORCE_INLINE __m128 _mm_addsub_ps(__m128 a, __m128 b)
6041{
6042 _sse2neon_const __m128 mask = _mm_setr_ps(-1.0f, 1.0f, -1.0f, 1.0f);
6043#if (defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)) || \
6044 defined(__ARM_FEATURE_FMA) /* VFPv4+ */
6045 return vreinterpretq_m128_f32(vfmaq_f32(vreinterpretq_f32_m128(a),
6046 vreinterpretq_f32_m128(mask),
6047 vreinterpretq_f32_m128(b)));
6048#else
6049 return _mm_add_ps(_mm_mul_ps(b, mask), a);
6050#endif
6051}
6052
6053// Horizontally add adjacent pairs of double-precision (64-bit) floating-point
6054// elements in a and b, and pack the results in dst.
6055// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hadd_pd
6056FORCE_INLINE __m128d _mm_hadd_pd(__m128d a, __m128d b)
6057{
6058#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
6059 return vreinterpretq_m128d_f64(
6060 vpaddq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
6061#else
6062 double a0 =
6063 sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0));
6064 double a1 =
6065 sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1));
6066 double b0 =
6067 sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0));
6068 double b1 =
6069 sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 1));
6070 double c[] = {a0 + a1, b0 + b1};
6071 return vreinterpretq_m128d_u64(vld1q_u64((uint64_t *) c));
6072#endif
6073}
6074
6075// Horizontally add adjacent pairs of single-precision (32-bit) floating-point
6076// elements in a and b, and pack the results in dst.
6077// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hadd_ps
6078FORCE_INLINE __m128 _mm_hadd_ps(__m128 a, __m128 b)
6079{
6080#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
6081 return vreinterpretq_m128_f32(vpaddq_f32(vreinterpretq_f32_m128(a),
6082 vreinterpretq_f32_m128(b)));
6083#else
6084 float32x2_t a10 = vget_low_f32(vreinterpretq_f32_m128(a));
6085 float32x2_t a32 = vget_high_f32(vreinterpretq_f32_m128(a));
6086 float32x2_t b10 = vget_low_f32(vreinterpretq_f32_m128(b));
6087 float32x2_t b32 = vget_high_f32(vreinterpretq_f32_m128(b));
6088 return vreinterpretq_m128_f32(
6089 vcombine_f32(vpadd_f32(a10, a32), vpadd_f32(b10, b32)));
6090#endif
6091}
6092
6093// Horizontally subtract adjacent pairs of double-precision (64-bit)
6094// floating-point elements in a and b, and pack the results in dst.
6095// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hsub_pd
6096FORCE_INLINE __m128d _mm_hsub_pd(__m128d a, __m128d b)
6097{
6098#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
6099 float64x2_t _a = vreinterpretq_f64_m128d(a);
6100 float64x2_t _b = vreinterpretq_f64_m128d(b);
6101 return vreinterpretq_m128d_f64(
6102 vsubq_f64(vuzp1q_f64(_a, _b), vuzp2q_f64(_a, _b)));
6103#else
6104 double a0 =
6105 sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0));
6106 double a1 =
6107 sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1));
6108 double b0 =
6109 sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0));
6110 double b1 =
6111 sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 1));
6112 double c[] = {a0 - a1, b0 - b1};
6113 return vreinterpretq_m128d_u64(vld1q_u64((uint64_t *) c));
6114#endif
6115}
6116
6117// Horizontally subtract adjacent pairs of single-precision (32-bit)
6118// floating-point elements in a and b, and pack the results in dst.
6119// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hsub_ps
6120FORCE_INLINE __m128 _mm_hsub_ps(__m128 _a, __m128 _b)
6121{
6122 float32x4_t a = vreinterpretq_f32_m128(_a);
6123 float32x4_t b = vreinterpretq_f32_m128(_b);
6124#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
6125 return vreinterpretq_m128_f32(
6126 vsubq_f32(vuzp1q_f32(a, b), vuzp2q_f32(a, b)));
6127#else
6128 float32x4x2_t c = vuzpq_f32(a, b);
6129 return vreinterpretq_m128_f32(vsubq_f32(c.val[0], c.val[1]));
6130#endif
6131}
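
/* Editor's note (illustrative, not part of sse2neon.h): the horizontal
 * add/subtract helpers above de-interleave even and odd lanes of the two
 * inputs (vuzp1q/vuzp2q, or vpadd on ARMv7) and then perform one vertical
 * operation.  _mm_hadd_ps(a, b) with a = {a0,a1,a2,a3}, b = {b0,b1,b2,b3}
 * therefore returns {a0+a1, a2+a3, b0+b1, b2+b3}, as this scalar model
 * shows: */
static inline __m128 sse2neon_hadd_ps_model(__m128 a, __m128 b)
{
    float fa[4], fb[4], r[4];
    _mm_storeu_ps(fa, a);
    _mm_storeu_ps(fb, b);
    r[0] = fa[0] + fa[1];
    r[1] = fa[2] + fa[3];
    r[2] = fb[0] + fb[1];
    r[3] = fb[2] + fb[3];
    return _mm_loadu_ps(r);
}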
6132
6133// Load 128-bits of integer data from unaligned memory into dst. This intrinsic
6134// may perform better than _mm_loadu_si128 when the data crosses a cache line
6135// boundary.
6136// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_lddqu_si128
6137#define _mm_lddqu_si128 _mm_loadu_si128
6138
6139// Load a double-precision (64-bit) floating-point element from memory into both
6140// elements of dst.
6141// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loaddup_pd
6142#define _mm_loaddup_pd _mm_load1_pd
6143
6144// Duplicate the low double-precision (64-bit) floating-point element from a,
6145// and store the results in dst.
6146// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movedup_pd
6147FORCE_INLINE __m128d _mm_movedup_pd(__m128d a)
6148{
6149#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
6150 return vreinterpretq_m128d_f64(
6151 vdupq_laneq_f64(vreinterpretq_f64_m128d(a), 0));
6152#else
6153 return vreinterpretq_m128d_u64(
6154 vdupq_n_u64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0)));
6155#endif
6156}
6157
6158// Duplicate odd-indexed single-precision (32-bit) floating-point elements
6159// from a, and store the results in dst.
6160// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movehdup_ps
6161FORCE_INLINE __m128 _mm_movehdup_ps(__m128 a)
6162{
6163#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
6164 return vreinterpretq_m128_f32(
6165 vtrn2q_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a)));
6166#elif defined(_sse2neon_shuffle)
6167 return vreinterpretq_m128_f32(vshuffleq_s32(
6168 vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a), 1, 1, 3, 3));
6169#else
6170 float32_t a1 = vgetq_lane_f32(vreinterpretq_f32_m128(a), 1);
6171 float32_t a3 = vgetq_lane_f32(vreinterpretq_f32_m128(a), 3);
6172 float ALIGN_STRUCT(16) data[4] = {a1, a1, a3, a3};
6173 return vreinterpretq_m128_f32(vld1q_f32(data));
6174#endif
6175}
6176
6177// Duplicate even-indexed single-precision (32-bit) floating-point elements
6178// from a, and store the results in dst.
6179// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_moveldup_ps
6180FORCE_INLINE __m128 _mm_moveldup_ps(__m128 a)
6181{
6182#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
6183 return vreinterpretq_m128_f32(
6184 vtrn1q_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a)));
6185#elif defined(_sse2neon_shuffle)
6186 return vreinterpretq_m128_f32(vshuffleq_s32(
6187 vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a), 0, 0, 2, 2));
6188#else
6189 float32_t a0 = vgetq_lane_f32(vreinterpretq_f32_m128(a), 0);
6190 float32_t a2 = vgetq_lane_f32(vreinterpretq_f32_m128(a), 2);
6191 float ALIGN_STRUCT(16) data[4] = {a0, a0, a2, a2};
6192 return vreinterpretq_m128_f32(vld1q_f32(data));
6193#endif
6194}
6195
6196/* SSSE3 */
6197
6198// Compute the absolute value of packed signed 16-bit integers in a, and store
6199// the unsigned results in dst.
6200// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_abs_epi16
6201FORCE_INLINE __m128i _mm_abs_epi16(__m128i a)
6202{
6203 return vreinterpretq_m128i_s16(vabsq_s16(vreinterpretq_s16_m128i(a)));
6204}
6205
6206// Compute the absolute value of packed signed 32-bit integers in a, and store
6207// the unsigned results in dst.
6208// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_abs_epi32
6209FORCE_INLINE __m128i _mm_abs_epi32(__m128i a)
6210{
6211 return vreinterpretq_m128i_s32(vabsq_s32(vreinterpretq_s32_m128i(a)));
6212}
6213
6214// Compute the absolute value of packed signed 8-bit integers in a, and store
6215// the unsigned results in dst.
6216// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_abs_epi8
6217FORCE_INLINE __m128i _mm_abs_epi8(__m128i a)
6218{
6219 return vreinterpretq_m128i_s8(vabsq_s8(vreinterpretq_s8_m128i(a)));
6220}
6221
6222// Compute the absolute value of packed signed 16-bit integers in a, and store
6223// the unsigned results in dst.
6224// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_abs_pi16
6225FORCE_INLINE __m64 _mm_abs_pi16(__m64 a)
6226{
6227 return vreinterpret_m64_s16(vabs_s16(vreinterpret_s16_m64(a)));
6228}
6229
6230// Compute the absolute value of packed signed 32-bit integers in a, and store
6231// the unsigned results in dst.
6232// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_abs_pi32
6233FORCE_INLINE __m64 _mm_abs_pi32(__m64 a)
6234{
6235 return vreinterpret_m64_s32(vabs_s32(vreinterpret_s32_m64(a)));
6236}
6237
6238// Compute the absolute value of packed signed 8-bit integers in a, and store
6239// the unsigned results in dst.
6240// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_abs_pi8
6241FORCE_INLINE __m64 _mm_abs_pi8(__m64 a)
6242{
6243 return vreinterpret_m64_s8(vabs_s8(vreinterpret_s8_m64(a)));
6244}
6245
6246// Concatenate 16-byte blocks in a and b into a 32-byte temporary result, shift
6247// the result right by imm8 bytes, and store the low 16 bytes in dst.
6248// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_alignr_epi8
6249#if defined(__GNUC__) && !defined(__clang__)
6250#define _mm_alignr_epi8(a, b, imm) \
6251 __extension__({ \
6252 uint8x16_t _a = vreinterpretq_u8_m128i(a); \
6253 uint8x16_t _b = vreinterpretq_u8_m128i(b); \
6254 __m128i ret; \
6255 if (_sse2neon_unlikely((imm) & ~31)) \
6256 ret = vreinterpretq_m128i_u8(vdupq_n_u8(0)); \
6257 else if ((imm) >= 16) \
6258 ret = _mm_srli_si128(a, (imm) >= 16 ? (imm) - 16 : 0); \
6259 else \
6260 ret = vreinterpretq_m128i_u8( \
6261 vextq_u8(_b, _a, (imm) < 16 ? (imm) : 0)); \
6262 ret; \
6263 })
6264
6265#else
6266#define _mm_alignr_epi8(a, b, imm) \
6267 _sse2neon_define2( \
6268 __m128i, a, b, uint8x16_t __a = vreinterpretq_u8_m128i(_a); \
6269 uint8x16_t __b = vreinterpretq_u8_m128i(_b); __m128i ret; \
6270 if (_sse2neon_unlikely((imm) & ~31)) ret = \
6271 vreinterpretq_m128i_u8(vdupq_n_u8(0)); \
6272 else if ((imm) >= 16) ret = \
6273 _mm_srli_si128(_a, (imm) >= 16 ? (imm) - 16 : 0); \
6274 else ret = vreinterpretq_m128i_u8( \
6275 vextq_u8(__b, __a, (imm) < 16 ? (imm) : 0)); \
6276 _sse2neon_return(ret);)
6277
6278#endif
6279
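/* Usage sketch (editor's addition, not part of sse2neon.h): _mm_alignr_epi8
 * concatenates a:b (a in the upper half), shifts the 32-byte value right by
 * imm bytes and keeps the low 16 bytes -- convenient for sliding a window
 * across two adjacent aligned loads.  If 'prev' holds bytes 0..15 and 'next'
 * holds bytes 16..31, the call below yields bytes 4..19: */
static inline __m128i sse2neon_alignr_demo(__m128i prev, __m128i next)
{
    return _mm_alignr_epi8(next, prev, 4);
}
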
6280// Concatenate 8-byte blocks in a and b into a 16-byte temporary result, shift
6281// the result right by imm8 bytes, and store the low 8 bytes in dst.
6282// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_alignr_pi8
6283#define _mm_alignr_pi8(a, b, imm) \
6284 _sse2neon_define2( \
6285 __m64, a, b, __m64 ret; if (_sse2neon_unlikely((imm) >= 16)) { \
6286 ret = vreinterpret_m64_s8(vdup_n_s8(0)); \
6287 } else { \
6288 uint8x8_t tmp_low; \
6289 uint8x8_t tmp_high; \
6290 if ((imm) >= 8) { \
6291 const int idx = (imm) - 8; \
6292 tmp_low = vreinterpret_u8_m64(_a); \
6293 tmp_high = vdup_n_u8(0); \
6294 ret = vreinterpret_m64_u8(vext_u8(tmp_low, tmp_high, idx)); \
6295 } else { \
6296 const int idx = (imm); \
6297 tmp_low = vreinterpret_u8_m64(_b); \
6298 tmp_high = vreinterpret_u8_m64(_a); \
6299 ret = vreinterpret_m64_u8(vext_u8(tmp_low, tmp_high, idx)); \
6300 } \
6301 } _sse2neon_return(ret);)
6302
6303// Horizontally add adjacent pairs of 16-bit integers in a and b, and pack the
6304// signed 16-bit results in dst.
6305// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hadd_epi16
6306FORCE_INLINE __m128i _mm_hadd_epi16(__m128i _a, __m128i _b)
6307{
6308 int16x8_t a = vreinterpretq_s16_m128i(_a);
6309 int16x8_t b = vreinterpretq_s16_m128i(_b);
6310#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
6311 return vreinterpretq_m128i_s16(vpaddq_s16(a, b));
6312#else
6313 return vreinterpretq_m128i_s16(
6314 vcombine_s16(vpadd_s16(vget_low_s16(a), vget_high_s16(a)),
6315 vpadd_s16(vget_low_s16(b), vget_high_s16(b))));
6316#endif
6317}
6318
6319// Horizontally add adjacent pairs of 32-bit integers in a and b, and pack the
6320// signed 32-bit results in dst.
6321// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hadd_epi32
6322FORCE_INLINE __m128i _mm_hadd_epi32(__m128i _a, __m128i _b)
6323{
6324 int32x4_t a = vreinterpretq_s32_m128i(_a);
6325 int32x4_t b = vreinterpretq_s32_m128i(_b);
6326#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
6327 return vreinterpretq_m128i_s32(vpaddq_s32(a, b));
6328#else
6329 return vreinterpretq_m128i_s32(
6330 vcombine_s32(vpadd_s32(vget_low_s32(a), vget_high_s32(a)),
6331 vpadd_s32(vget_low_s32(b), vget_high_s32(b))));
6332#endif
6333}
6334
6335// Horizontally add adjacent pairs of 16-bit integers in a and b, and pack the
6336// signed 16-bit results in dst.
6337// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hadd_pi16
6338FORCE_INLINE __m64 _mm_hadd_pi16(__m64 a, __m64 b)
6339{
6340 return vreinterpret_m64_s16(
6341 vpadd_s16(vreinterpret_s16_m64(a), vreinterpret_s16_m64(b)));
6342}
6343
6344// Horizontally add adjacent pairs of 32-bit integers in a and b, and pack the
6345// signed 32-bit results in dst.
6346// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hadd_pi32
6347FORCE_INLINE __m64 _mm_hadd_pi32(__m64 a, __m64 b)
6348{
6349 return vreinterpret_m64_s32(
6350 vpadd_s32(vreinterpret_s32_m64(a), vreinterpret_s32_m64(b)));
6351}
6352
6353// Horizontally add adjacent pairs of signed 16-bit integers in a and b using
6354// saturation, and pack the signed 16-bit results in dst.
6355// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hadds_epi16
6356FORCE_INLINE __m128i _mm_hadds_epi16(__m128i _a, __m128i _b)
6357{
6358#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
6359 int16x8_t a = vreinterpretq_s16_m128i(_a);
6360 int16x8_t b = vreinterpretq_s16_m128i(_b);
6361 return vreinterpretq_s64_s16(
6362 vqaddq_s16(vuzp1q_s16(a, b), vuzp2q_s16(a, b)));
6363#else
6364 int32x4_t a = vreinterpretq_s32_m128i(_a);
6365 int32x4_t b = vreinterpretq_s32_m128i(_b);
6366 // Interleave using vshrn/vmovn
6367 // [a0|a2|a4|a6|b0|b2|b4|b6]
6368 // [a1|a3|a5|a7|b1|b3|b5|b7]
6369 int16x8_t ab0246 = vcombine_s16(vmovn_s32(a), vmovn_s32(b));
6370 int16x8_t ab1357 = vcombine_s16(vshrn_n_s32(a, 16), vshrn_n_s32(b, 16));
6371 // Saturated add
6372 return vreinterpretq_m128i_s16(vqaddq_s16(ab0246, ab1357));
6373#endif
6374}
6375
6376// Horizontally add adjacent pairs of signed 16-bit integers in a and b using
6377// saturation, and pack the signed 16-bit results in dst.
6378// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hadds_pi16
6379FORCE_INLINE __m64 _mm_hadds_pi16(__m64 _a, __m64 _b)
6380{
6381 int16x4_t a = vreinterpret_s16_m64(_a);
6382 int16x4_t b = vreinterpret_s16_m64(_b);
6383#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
6384 return vreinterpret_s64_s16(vqadd_s16(vuzp1_s16(a, b), vuzp2_s16(a, b)));
6385#else
6386 int16x4x2_t res = vuzp_s16(a, b);
6387 return vreinterpret_s64_s16(vqadd_s16(res.val[0], res.val[1]));
6388#endif
6389}
6390
6391// Horizontally subtract adjacent pairs of 16-bit integers in a and b, and pack
6392// the signed 16-bit results in dst.
6393// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hsub_epi16
6394FORCE_INLINE __m128i _mm_hsub_epi16(__m128i _a, __m128i _b)
6395{
6396 int16x8_t a = vreinterpretq_s16_m128i(_a);
6397 int16x8_t b = vreinterpretq_s16_m128i(_b);
6398#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
6399 return vreinterpretq_m128i_s16(
6400 vsubq_s16(vuzp1q_s16(a, b), vuzp2q_s16(a, b)));
6401#else
6402 int16x8x2_t c = vuzpq_s16(a, b);
6403 return vreinterpretq_m128i_s16(vsubq_s16(c.val[0], c.val[1]));
6404#endif
6405}
6406
6407// Horizontally subtract adjacent pairs of 32-bit integers in a and b, and pack
6408// the signed 32-bit results in dst.
6409// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hsub_epi32
6410FORCE_INLINE __m128i _mm_hsub_epi32(__m128i _a, __m128i _b)
6411{
6412 int32x4_t a = vreinterpretq_s32_m128i(_a);
6413 int32x4_t b = vreinterpretq_s32_m128i(_b);
6414#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
6415 return vreinterpretq_m128i_s32(
6416 vsubq_s32(vuzp1q_s32(a, b), vuzp2q_s32(a, b)));
6417#else
6418 int32x4x2_t c = vuzpq_s32(a, b);
6419 return vreinterpretq_m128i_s32(vsubq_s32(c.val[0], c.val[1]));
6420#endif
6421}
6422
6423// Horizontally subtract adjacent pairs of 16-bit integers in a and b, and pack
6424// the signed 16-bit results in dst.
6425// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hsub_pi16
6426FORCE_INLINE __m64 _mm_hsub_pi16(__m64 _a, __m64 _b)
6427{
6428 int16x4_t a = vreinterpret_s16_m64(_a);
6429 int16x4_t b = vreinterpret_s16_m64(_b);
6430#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
6431 return vreinterpret_m64_s16(vsub_s16(vuzp1_s16(a, b), vuzp2_s16(a, b)));
6432#else
6433 int16x4x2_t c = vuzp_s16(a, b);
6434 return vreinterpret_m64_s16(vsub_s16(c.val[0], c.val[1]));
6435#endif
6436}
6437
6438// Horizontally subtract adjacent pairs of 32-bit integers in a and b, and pack
6439// the signed 32-bit results in dst.
6440// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hsub_pi32
6441FORCE_INLINE __m64 _mm_hsub_pi32(__m64 _a, __m64 _b)
6442{
6443 int32x2_t a = vreinterpret_s32_m64(_a);
6444 int32x2_t b = vreinterpret_s32_m64(_b);
6445#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
6446 return vreinterpret_m64_s32(vsub_s32(vuzp1_s32(a, b), vuzp2_s32(a, b)));
6447#else
6448 int32x2x2_t c = vuzp_s32(a, b);
6449 return vreinterpret_m64_s32(vsub_s32(c.val[0], c.val[1]));
6450#endif
6451}
6452
6453// Horizontally subtract adjacent pairs of signed 16-bit integers in a and b
6454// using saturation, and pack the signed 16-bit results in dst.
6455// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hsubs_epi16
6456FORCE_INLINE __m128i _mm_hsubs_epi16(__m128i _a, __m128i _b)
6457{
6458 int16x8_t a = vreinterpretq_s16_m128i(_a);
6459 int16x8_t b = vreinterpretq_s16_m128i(_b);
6460#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
6461 return vreinterpretq_m128i_s16(
6462 vqsubq_s16(vuzp1q_s16(a, b), vuzp2q_s16(a, b)));
6463#else
6464 int16x8x2_t c = vuzpq_s16(a, b);
6465 return vreinterpretq_m128i_s16(vqsubq_s16(c.val[0], c.val[1]));
6466#endif
6467}
6468
6469// Horizontally subtract adjacent pairs of signed 16-bit integers in a and b
6470// using saturation, and pack the signed 16-bit results in dst.
6471// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hsubs_pi16
6472FORCE_INLINE __m64 _mm_hsubs_pi16(__m64 _a, __m64 _b)
6473{
6474 int16x4_t a = vreinterpret_s16_m64(_a);
6475 int16x4_t b = vreinterpret_s16_m64(_b);
6476#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
6477 return vreinterpret_m64_s16(vqsub_s16(vuzp1_s16(a, b), vuzp2_s16(a, b)));
6478#else
6479 int16x4x2_t c = vuzp_s16(a, b);
6480 return vreinterpret_m64_s16(vqsub_s16(c.val[0], c.val[1]));
6481#endif
6482}
6483
6484// Vertically multiply each unsigned 8-bit integer from a with the corresponding
6485// signed 8-bit integer from b, producing intermediate signed 16-bit integers.
6486// Horizontally add adjacent pairs of intermediate signed 16-bit integers,
6487// and pack the saturated results in dst.
6488// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maddubs_epi16
6489FORCE_INLINE __m128i _mm_maddubs_epi16(__m128i _a, __m128i _b)
6490{
6491#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
6492 uint8x16_t a = vreinterpretq_u8_m128i(_a);
6493 int8x16_t b = vreinterpretq_s8_m128i(_b);
6494 int16x8_t tl = vmulq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(a))),
6495 vmovl_s8(vget_low_s8(b)));
6496 int16x8_t th = vmulq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(a))),
6497 vmovl_s8(vget_high_s8(b)));
6498 return vreinterpretq_m128i_s16(
6499 vqaddq_s16(vuzp1q_s16(tl, th), vuzp2q_s16(tl, th)));
6500#else
6501 // This would be much simpler if x86 would choose to zero extend OR sign
6502 // extend, not both. This could probably be optimized better.
6503 uint16x8_t a = vreinterpretq_u16_m128i(_a);
6504 int16x8_t b = vreinterpretq_s16_m128i(_b);
6505
6506 // Zero extend a
6507 int16x8_t a_odd = vreinterpretq_s16_u16(vshrq_n_u16(a, 8));
6508 int16x8_t a_even = vreinterpretq_s16_u16(vbicq_u16(a, vdupq_n_u16(0xff00)));
6509
6510 // Sign extend by shifting left then shifting right.
6511 int16x8_t b_even = vshrq_n_s16(vshlq_n_s16(b, 8), 8);
6512 int16x8_t b_odd = vshrq_n_s16(b, 8);
6513
6514 // multiply
6515 int16x8_t prod1 = vmulq_s16(a_even, b_even);
6516 int16x8_t prod2 = vmulq_s16(a_odd, b_odd);
6517
6518 // saturated add
6519 return vreinterpretq_m128i_s16(vqaddq_s16(prod1, prod2));
6520#endif
6521}
6522
6523// Vertically multiply each unsigned 8-bit integer from a with the corresponding
6524// signed 8-bit integer from b, producing intermediate signed 16-bit integers.
6525// Horizontally add adjacent pairs of intermediate signed 16-bit integers, and
6526// pack the saturated results in dst.
6527// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maddubs_pi16
6528FORCE_INLINE __m64 _mm_maddubs_pi16(__m64 _a, __m64 _b)
6529{
6530 uint16x4_t a = vreinterpret_u16_m64(_a);
6531 int16x4_t b = vreinterpret_s16_m64(_b);
6532
6533 // Zero extend a
6534 int16x4_t a_odd = vreinterpret_s16_u16(vshr_n_u16(a, 8));
6535 int16x4_t a_even = vreinterpret_s16_u16(vand_u16(a, vdup_n_u16(0xff)));
6536
6537 // Sign extend by shifting left then shifting right.
6538 int16x4_t b_even = vshr_n_s16(vshl_n_s16(b, 8), 8);
6539 int16x4_t b_odd = vshr_n_s16(b, 8);
6540
6541 // multiply
6542 int16x4_t prod1 = vmul_s16(a_even, b_even);
6543 int16x4_t prod2 = vmul_s16(a_odd, b_odd);
6544
6545 // saturated add
6546 return vreinterpret_m64_s16(vqadd_s16(prod1, prod2));
6547}
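
/* Worked example (editor's sketch, not part of sse2neon.h): _mm_maddubs_epi16
 * treats a as unsigned bytes and b as signed bytes, forms 16-bit products and
 * adds each adjacent pair with signed saturation.  Below, lane 0 of the
 * result is 255 * (-1) + 2 * 3 = -249 (byte 0 of a is 0xFF = 255 unsigned). */
static inline int16_t sse2neon_maddubs_demo(void)
{
    __m128i a = _mm_setr_epi8(-1, 2, 0, 0, 0, 0, 0, 0,
                              0, 0, 0, 0, 0, 0, 0, 0);
    __m128i b = _mm_setr_epi8(-1, 3, 0, 0, 0, 0, 0, 0,
                              0, 0, 0, 0, 0, 0, 0, 0);
    __m128i r = _mm_maddubs_epi16(a, b);
    return (int16_t) _mm_extract_epi16(r, 0); /* -249 */
}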
6548
6549// Multiply packed signed 16-bit integers in a and b, producing intermediate
6550// signed 32-bit integers. Shift right by 15 bits while rounding up, and store
6551// the packed 16-bit integers in dst.
6552// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mulhrs_epi16
6553FORCE_INLINE __m128i _mm_mulhrs_epi16(__m128i a, __m128i b)
6554{
6555 // Has issues due to saturation
6556 // return vreinterpretq_m128i_s16(vqrdmulhq_s16(a, b));
6557
6558 // Multiply
6559 int32x4_t mul_lo = vmull_s16(vget_low_s16(vreinterpretq_s16_m128i(a)),
6560 vget_low_s16(vreinterpretq_s16_m128i(b)));
6561 int32x4_t mul_hi = vmull_s16(vget_high_s16(vreinterpretq_s16_m128i(a)),
6562 vget_high_s16(vreinterpretq_s16_m128i(b)));
6563
6564 // Rounding narrowing shift right
6565 // narrow = (int16_t)((mul + 16384) >> 15);
6566 int16x4_t narrow_lo = vrshrn_n_s32(mul_lo, 15);
6567 int16x4_t narrow_hi = vrshrn_n_s32(mul_hi, 15);
6568
6569 // Join together
6570 return vreinterpretq_m128i_s16(vcombine_s16(narrow_lo, narrow_hi));
6571}
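
/* Worked arithmetic (editor's sketch, not part of sse2neon.h): each lane of
 * _mm_mulhrs_epi16 is (int16_t)((a * b + 0x4000) >> 15), which is exactly
 * what vrshrn_n_s32(..., 15) above computes.  For a = 0x4000 (0.5 in Q15)
 * and b = 0x2000 (0.25 in Q15): (0x4000 * 0x2000 + 0x4000) >> 15 = 0x1000. */
static inline int16_t sse2neon_mulhrs_scalar_model(int16_t a, int16_t b)
{
    return (int16_t) (((int32_t) a * (int32_t) b + 0x4000) >> 15);
}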
6572
6573// Multiply packed signed 16-bit integers in a and b, producing intermediate
6574// signed 32-bit integers. Truncate each intermediate integer to the 18 most
6575// significant bits, round by adding 1, and store bits [16:1] to dst.
6576// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mulhrs_pi16
6577FORCE_INLINE __m64 _mm_mulhrs_pi16(__m64 a, __m64 b)
6578{
6579 int32x4_t mul_extend =
6580 vmull_s16((vreinterpret_s16_m64(a)), (vreinterpret_s16_m64(b)));
6581
6582 // Rounding narrowing shift right
6583 return vreinterpret_m64_s16(vrshrn_n_s32(mul_extend, 15));
6584}
6585
6586// Shuffle packed 8-bit integers in a according to shuffle control mask in the
6587// corresponding 8-bit element of b, and store the results in dst.
6588// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shuffle_epi8
6589FORCE_INLINE __m128i _mm_shuffle_epi8(__m128i a, __m128i b)
6590{
6591 int8x16_t tbl = vreinterpretq_s8_m128i(a); // input a
6592 uint8x16_t idx = vreinterpretq_u8_m128i(b); // input b
6593 uint8x16_t idx_masked =
6594 vandq_u8(idx, vdupq_n_u8(0x8F)); // avoid using meaningless bits
6595#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
6596 return vreinterpretq_m128i_s8(vqtbl1q_s8(tbl, idx_masked));
6597#elif defined(__GNUC__)
6598 int8x16_t ret;
6599 // %e and %f represent the even and odd D registers
6600 // respectively.
6601 __asm__ __volatile__(
6602 "vtbl.8 %e[ret], {%e[tbl], %f[tbl]}, %e[idx]\n"
6603 "vtbl.8 %f[ret], {%e[tbl], %f[tbl]}, %f[idx]\n"
6604 : [ret] "=&w"(ret)
6605 : [tbl] "w"(tbl), [idx] "w"(idx_masked));
6606 return vreinterpretq_m128i_s8(ret);
6607#else
6608 // use this line if testing on aarch64
6609 int8x8x2_t a_split = {vget_low_s8(tbl), vget_high_s8(tbl)};
6610 return vreinterpretq_m128i_s8(
6611 vcombine_s8(vtbl2_s8(a_split, vget_low_u8(idx_masked)),
6612 vtbl2_s8(a_split, vget_high_u8(idx_masked))));
6613#endif
6614}
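
/* Usage sketch (editor's addition, not part of sse2neon.h): _mm_shuffle_epi8
 * is a byte-wise table lookup, and index bytes with bit 7 set produce zero;
 * the 0x8F mask above preserves that because out-of-range NEON table indices
 * also return zero.  Reversing the 16 bytes of a vector: */
static inline __m128i sse2neon_reverse_bytes(__m128i v)
{
    const __m128i rev_idx =
        _mm_setr_epi8(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
    return _mm_shuffle_epi8(v, rev_idx);
}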
6615
6616// Shuffle packed 8-bit integers in a according to shuffle control mask in the
6617// corresponding 8-bit element of b, and store the results in dst.
6618// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shuffle_pi8
6619FORCE_INLINE __m64 _mm_shuffle_pi8(__m64 a, __m64 b)
6620{
6621 const int8x8_t controlMask =
6622 vand_s8(vreinterpret_s8_m64(b), vdup_n_s8((int8_t) (0x1 << 7 | 0x07)));
6623 int8x8_t res = vtbl1_s8(vreinterpret_s8_m64(a), controlMask);
6624 return vreinterpret_m64_s8(res);
6625}
6626
6627// Negate packed 16-bit integers in a when the corresponding signed
6628// 16-bit integer in b is negative, and store the results in dst.
6629// Elements in dst are zeroed out when the corresponding element
6630// in b is zero.
6631// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sign_epi16
6632FORCE_INLINE __m128i _mm_sign_epi16(__m128i _a, __m128i _b)
6633{
6634 int16x8_t a = vreinterpretq_s16_m128i(_a);
6635 int16x8_t b = vreinterpretq_s16_m128i(_b);
6636
6637 // signed shift right: faster than vclt
6638 // (b < 0) ? 0xFFFF : 0
6639 uint16x8_t ltMask = vreinterpretq_u16_s16(vshrq_n_s16(b, 15));
6640 // (b == 0) ? 0xFFFF : 0
6641#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
6642 int16x8_t zeroMask = vreinterpretq_s16_u16(vceqzq_s16(b));
6643#else
6644 int16x8_t zeroMask = vreinterpretq_s16_u16(vceqq_s16(b, vdupq_n_s16(0)));
6645#endif
6646
6647 // bitwise select either a or negative 'a' (vnegq_s16(a) equals to negative
6648 // 'a') based on ltMask
6649 int16x8_t masked = vbslq_s16(ltMask, vnegq_s16(a), a);
6650 // res = masked & (~zeroMask)
6651 int16x8_t res = vbicq_s16(masked, zeroMask);
6652 return vreinterpretq_m128i_s16(res);
6653}
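
/* Editor's sketch (not part of sse2neon.h): per-lane model of the
 * _mm_sign_epi16 logic above -- negate where b is negative, zero where b is
 * zero, pass a through where b is positive: */
static inline int16_t sse2neon_sign_lane_model(int16_t a, int16_t b)
{
    if (b < 0)
        return (int16_t) -(int32_t) a; /* wraps for INT16_MIN, like vnegq_s16 */
    return (int16_t) (b == 0 ? 0 : a);
}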
6654
6655// Negate packed 32-bit integers in a when the corresponding signed
6656// 32-bit integer in b is negative, and store the results in dst.
6657// Elements in dst are zeroed out when the corresponding element
6658// in b is zero.
6659// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sign_epi32
6660FORCE_INLINE __m128i _mm_sign_epi32(__m128i _a, __m128i _b)
6661{
6662 int32x4_t a = vreinterpretq_s32_m128i(_a);
6663 int32x4_t b = vreinterpretq_s32_m128i(_b);
6664
6665 // signed shift right: faster than vclt
6666 // (b < 0) ? 0xFFFFFFFF : 0
6667 uint32x4_t ltMask = vreinterpretq_u32_s32(vshrq_n_s32(b, 31));
6668
6669 // (b == 0) ? 0xFFFFFFFF : 0
6670#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
6671 int32x4_t zeroMask = vreinterpretq_s32_u32(vceqzq_s32(b));
6672#else
6673 int32x4_t zeroMask = vreinterpretq_s32_u32(vceqq_s32(b, vdupq_n_s32(0)));
6674#endif
6675
6676 // bitwise select either a or negative 'a' (vnegq_s32(a) equals to negative
6677 // 'a') based on ltMask
6678 int32x4_t masked = vbslq_s32(ltMask, vnegq_s32(a), a);
6679 // res = masked & (~zeroMask)
6680 int32x4_t res = vbicq_s32(masked, zeroMask);
6681 return vreinterpretq_m128i_s32(res);
6682}
6683
6684// Negate packed 8-bit integers in a when the corresponding signed
6685// 8-bit integer in b is negative, and store the results in dst.
6686// Elements in dst are zeroed out when the corresponding element
6687// in b is zero.
6688// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sign_epi8
6689FORCE_INLINE __m128i _mm_sign_epi8(__m128i _a, __m128i _b)
6690{
6691 int8x16_t a = vreinterpretq_s8_m128i(_a);
6692 int8x16_t b = vreinterpretq_s8_m128i(_b);
6693
6694 // signed shift right: faster than vclt
6695 // (b < 0) ? 0xFF : 0
6696 uint8x16_t ltMask = vreinterpretq_u8_s8(vshrq_n_s8(b, 7));
6697
6698 // (b == 0) ? 0xFF : 0
6699#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
6700 int8x16_t zeroMask = vreinterpretq_s8_u8(vceqzq_s8(b));
6701#else
6702 int8x16_t zeroMask = vreinterpretq_s8_u8(vceqq_s8(b, vdupq_n_s8(0)));
6703#endif
6704
6705 // bitwise select either a or negative 'a' (vnegq_s8(a) return negative 'a')
6706 // based on ltMask
6707 int8x16_t masked = vbslq_s8(ltMask, vnegq_s8(a), a);
6708 // res = masked & (~zeroMask)
6709 int8x16_t res = vbicq_s8(masked, zeroMask);
6710
6711 return vreinterpretq_m128i_s8(res);
6712}
6713
6714// Negate packed 16-bit integers in a when the corresponding signed 16-bit
6715// integer in b is negative, and store the results in dst. Elements in dst are
6716// zeroed out when the corresponding element in b is zero.
6717// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sign_pi16
6718FORCE_INLINE __m64 _mm_sign_pi16(__m64 _a, __m64 _b)
6719{
6720 int16x4_t a = vreinterpret_s16_m64(_a);
6721 int16x4_t b = vreinterpret_s16_m64(_b);
6722
6723 // signed shift right: faster than vclt
6724 // (b < 0) ? 0xFFFF : 0
6725 uint16x4_t ltMask = vreinterpret_u16_s16(vshr_n_s16(b, 15));
6726
6727 // (b == 0) ? 0xFFFF : 0
6728#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
6729 int16x4_t zeroMask = vreinterpret_s16_u16(vceqz_s16(b));
6730#else
6731 int16x4_t zeroMask = vreinterpret_s16_u16(vceq_s16(b, vdup_n_s16(0)));
6732#endif
6733
6734 // bitwise select either a or negative 'a' (vneg_s16(a) return negative 'a')
6735 // based on ltMask
6736 int16x4_t masked = vbsl_s16(ltMask, vneg_s16(a), a);
6737 // res = masked & (~zeroMask)
6738 int16x4_t res = vbic_s16(masked, zeroMask);
6739
6740 return vreinterpret_m64_s16(res);
6741}
6742
6743// Negate packed 32-bit integers in a when the corresponding signed 32-bit
6744// integer in b is negative, and store the results in dst. Elements in dst are
6745// zeroed out when the corresponding element in b is zero.
6746// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sign_pi32
6747FORCE_INLINE __m64 _mm_sign_pi32(__m64 _a, __m64 _b)
6748{
6749 int32x2_t a = vreinterpret_s32_m64(_a);
6750 int32x2_t b = vreinterpret_s32_m64(_b);
6751
6752 // signed shift right: faster than vclt
6753 // (b < 0) ? 0xFFFFFFFF : 0
6754 uint32x2_t ltMask = vreinterpret_u32_s32(vshr_n_s32(b, 31));
6755
6756 // (b == 0) ? 0xFFFFFFFF : 0
6757#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
6758 int32x2_t zeroMask = vreinterpret_s32_u32(vceqz_s32(b));
6759#else
6760 int32x2_t zeroMask = vreinterpret_s32_u32(vceq_s32(b, vdup_n_s32(0)));
6761#endif
6762
6763 // bitwise select either a or negative 'a' (vneg_s32(a) return negative 'a')
6764 // based on ltMask
6765 int32x2_t masked = vbsl_s32(ltMask, vneg_s32(a), a);
6766 // res = masked & (~zeroMask)
6767 int32x2_t res = vbic_s32(masked, zeroMask);
6768
6769 return vreinterpret_m64_s32(res);
6770}
6771
6772// Negate packed 8-bit integers in a when the corresponding signed 8-bit integer
6773// in b is negative, and store the results in dst. Elements in dst are zeroed out
6774// when the corresponding element in b is zero.
6775// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sign_pi8
6776FORCE_INLINE __m64 _mm_sign_pi8(__m64 _a, __m64 _b)
6777{
6778 int8x8_t a = vreinterpret_s8_m64(_a);
6779 int8x8_t b = vreinterpret_s8_m64(_b);
6780
6781 // signed shift right: faster than vclt
6782 // (b < 0) ? 0xFF : 0
6783 uint8x8_t ltMask = vreinterpret_u8_s8(vshr_n_s8(b, 7));
6784
6785 // (b == 0) ? 0xFF : 0
6786#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
6787 int8x8_t zeroMask = vreinterpret_s8_u8(vceqz_s8(b));
6788#else
6789 int8x8_t zeroMask = vreinterpret_s8_u8(vceq_s8(b, vdup_n_s8(0)));
6790#endif
6791
6792 // bitwise select either a or negative 'a' (vneg_s8(a) return negative 'a')
6793 // based on ltMask
6794 int8x8_t masked = vbsl_s8(ltMask, vneg_s8(a), a);
6795 // res = masked & (~zeroMask)
6796 int8x8_t res = vbic_s8(masked, zeroMask);
6797
6798 return vreinterpret_m64_s8(res);
6799}
6800
6801/* SSE4.1 */
6802
6803// Blend packed 16-bit integers from a and b using control mask imm8, and store
6804// the results in dst.
6805// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_blend_epi16
6806// FORCE_INLINE __m128i _mm_blend_epi16(__m128i a, __m128i b,
6807// __constrange(0,255) int imm)
6808#define _mm_blend_epi16(a, b, imm) \
6809 _sse2neon_define2( \
6810 __m128i, a, b, \
6811 const uint16_t _mask[8] = \
6812 _sse2neon_init(((imm) & (1 << 0)) ? (uint16_t) - 1 : 0x0, \
6813 ((imm) & (1 << 1)) ? (uint16_t) - 1 : 0x0, \
6814 ((imm) & (1 << 2)) ? (uint16_t) - 1 : 0x0, \
6815 ((imm) & (1 << 3)) ? (uint16_t) - 1 : 0x0, \
6816 ((imm) & (1 << 4)) ? (uint16_t) - 1 : 0x0, \
6817 ((imm) & (1 << 5)) ? (uint16_t) - 1 : 0x0, \
6818 ((imm) & (1 << 6)) ? (uint16_t) - 1 : 0x0, \
6819 ((imm) & (1 << 7)) ? (uint16_t) - 1 : 0x0); \
6820 uint16x8_t _mask_vec = vld1q_u16(_mask); \
6821 uint16x8_t __a = vreinterpretq_u16_m128i(_a); \
6822 uint16x8_t __b = vreinterpretq_u16_m128i(_b); _sse2neon_return( \
6823 vreinterpretq_m128i_u16(vbslq_u16(_mask_vec, __b, __a)));)
6824
6825// Blend packed double-precision (64-bit) floating-point elements from a and b
6826// using control mask imm8, and store the results in dst.
6827// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_blend_pd
6828#define _mm_blend_pd(a, b, imm) \
6829 _sse2neon_define2( \
6830 __m128d, a, b, \
6831 const uint64_t _mask[2] = \
6832 _sse2neon_init(((imm) & (1 << 0)) ? ~UINT64_C(0) : UINT64_C(0), \
6833 ((imm) & (1 << 1)) ? ~UINT64_C(0) : UINT64_C(0)); \
6834 uint64x2_t _mask_vec = vld1q_u64(_mask); \
6835 uint64x2_t __a = vreinterpretq_u64_m128d(_a); \
6836 uint64x2_t __b = vreinterpretq_u64_m128d(_b); _sse2neon_return( \
6837 vreinterpretq_m128d_u64(vbslq_u64(_mask_vec, __b, __a)));)
6838
6839// Blend packed single-precision (32-bit) floating-point elements from a and b
6840// using mask, and store the results in dst.
6841// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_blend_ps
6842FORCE_INLINE __m128 _mm_blend_ps(__m128 _a, __m128 _b, const char imm8)
6843{
6844 const uint32_t ALIGN_STRUCT(16) data[4] = {
6845 (imm8 & (1 << 0)) ? UINT32_MAX : 0, (imm8 & (1 << 1)) ? UINT32_MAX : 0,
6846 (imm8 & (1 << 2)) ? UINT32_MAX : 0, (imm8 & (1 << 3)) ? UINT32_MAX : 0};
6847 uint32x4_t mask = vld1q_u32(data);
6848 float32x4_t a = vreinterpretq_f32_m128(_a);
6849 float32x4_t b = vreinterpretq_f32_m128(_b);
6850 return vreinterpretq_m128_f32(vbslq_f32(mask, b, a));
6851}
6852
6853// Blend packed 8-bit integers from a and b using mask, and store the results in
6854// dst.
6855// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_blendv_epi8
6856FORCE_INLINE __m128i _mm_blendv_epi8(__m128i _a, __m128i _b, __m128i _mask)
6857{
6858 // Use a signed shift right to create a mask with the sign bit
6859 uint8x16_t mask =
6860 vreinterpretq_u8_s8(vshrq_n_s8(vreinterpretq_s8_m128i(_mask), 7));
6861 uint8x16_t a = vreinterpretq_u8_m128i(_a);
6862 uint8x16_t b = vreinterpretq_u8_m128i(_b);
6863 return vreinterpretq_m128i_u8(vbslq_u8(mask, b, a));
6864}
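
/* Usage sketch (editor's addition, not part of sse2neon.h): _mm_blendv_epi8
 * picks b wherever the mask byte has its sign bit set, else a, which is why
 * the implementation above only needs an arithmetic shift by 7 and vbslq_u8.
 * A typical branch-free select driven by a comparison: */
static inline __m128i sse2neon_max_epi8_demo(__m128i a, __m128i b)
{
    __m128i gt = _mm_cmpgt_epi8(b, a); /* 0xFF where b > a */
    return _mm_blendv_epi8(a, b, gt);  /* per-byte signed max */
}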
6865
6866// Blend packed double-precision (64-bit) floating-point elements from a and b
6867// using mask, and store the results in dst.
6868// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_blendv_pd
6869FORCE_INLINE __m128d _mm_blendv_pd(__m128d _a, __m128d _b, __m128d _mask)
6870{
6871 uint64x2_t mask =
6872 vreinterpretq_u64_s64(vshrq_n_s64(vreinterpretq_s64_m128d(_mask), 63));
6873#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
6874 float64x2_t a = vreinterpretq_f64_m128d(_a);
6875 float64x2_t b = vreinterpretq_f64_m128d(_b);
6876 return vreinterpretq_m128d_f64(vbslq_f64(mask, b, a));
6877#else
6878 uint64x2_t a = vreinterpretq_u64_m128d(_a);
6879 uint64x2_t b = vreinterpretq_u64_m128d(_b);
6880 return vreinterpretq_m128d_u64(vbslq_u64(mask, b, a));
6881#endif
6882}
6883
6884// Blend packed single-precision (32-bit) floating-point elements from a and b
6885// using mask, and store the results in dst.
6886// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_blendv_ps
6887FORCE_INLINE __m128 _mm_blendv_ps(__m128 _a, __m128 _b, __m128 _mask)
6888{
6889 // Use a signed shift right to create a mask with the sign bit
6890 uint32x4_t mask =
6891 vreinterpretq_u32_s32(vshrq_n_s32(vreinterpretq_s32_m128(_mask), 31));
6892 float32x4_t a = vreinterpretq_f32_m128(_a);
6893 float32x4_t b = vreinterpretq_f32_m128(_b);
6894 return vreinterpretq_m128_f32(vbslq_f32(mask, b, a));
6895}
6896
6897// Round the packed double-precision (64-bit) floating-point elements in a up
6898// to an integer value, and store the results as packed double-precision
6899// floating-point elements in dst.
6900// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_ceil_pd
6901FORCE_INLINE __m128d _mm_ceil_pd(__m128d a)
6902{
6903#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
6904 return vreinterpretq_m128d_f64(vrndpq_f64(vreinterpretq_f64_m128d(a)));
6905#else
6906 double a0, a1;
6907 a0 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0));
6908 a1 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1));
6909 return _mm_set_pd(ceil(a1), ceil(a0));
6910#endif
6911}
6912
6913// Round the packed single-precision (32-bit) floating-point elements in a up to
6914// an integer value, and store the results as packed single-precision
6915// floating-point elements in dst.
6916// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_ceil_ps
6917FORCE_INLINE __m128 _mm_ceil_ps(__m128 a)
6918{
6919#if (defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)) || \
6920 defined(__ARM_FEATURE_DIRECTED_ROUNDING)
6921 return vreinterpretq_m128_f32(vrndpq_f32(vreinterpretq_f32_m128(a)));
6922#else
6923 float *f = (float *) &a;
6924 return _mm_set_ps(ceilf(f[3]), ceilf(f[2]), ceilf(f[1]), ceilf(f[0]));
6925#endif
6926}
6927
6928// Round the lower double-precision (64-bit) floating-point element in b up to
6929// an integer value, store the result as a double-precision floating-point
6930// element in the lower element of dst, and copy the upper element from a to the
6931// upper element of dst.
6932// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_ceil_sd
6933FORCE_INLINE __m128d _mm_ceil_sd(__m128d a, __m128d b)
6934{
6935 return _mm_move_sd(a, _mm_ceil_pd(b));
6936}
6937
6938// Round the lower single-precision (32-bit) floating-point element in b up to
6939// an integer value, store the result as a single-precision floating-point
6940// element in the lower element of dst, and copy the upper 3 packed elements
6941// from a to the upper elements of dst.
6942// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_ceil_ss
6943FORCE_INLINE __m128 _mm_ceil_ss(__m128 a, __m128 b)
6944{
6945 return _mm_move_ss(a, _mm_ceil_ps(b));
6946}
6947
6948// Compare packed 64-bit integers in a and b for equality, and store the results
6949// in dst
6950FORCE_INLINE __m128i _mm_cmpeq_epi64(__m128i a, __m128i b)
6951{
6952#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
6953 return vreinterpretq_m128i_u64(
6954 vceqq_u64(vreinterpretq_u64_m128i(a), vreinterpretq_u64_m128i(b)));
6955#else
6956 // ARMv7 lacks vceqq_u64
6957 // (a == b) -> (a_lo == b_lo) && (a_hi == b_hi)
6958 uint32x4_t cmp =
6959 vceqq_u32(vreinterpretq_u32_m128i(a), vreinterpretq_u32_m128i(b));
6960 uint32x4_t swapped = vrev64q_u32(cmp);
6961 return vreinterpretq_m128i_u32(vandq_u32(cmp, swapped));
6962#endif
6963}
6964
6965// Sign extend packed 16-bit integers in a to packed 32-bit integers, and store
6966// the results in dst.
6967// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepi16_epi32
6968FORCE_INLINE __m128i _mm_cvtepi16_epi32(__m128i a)
6969{
6970 return vreinterpretq_m128i_s32(
6971 vmovl_s16(vget_low_s16(vreinterpretq_s16_m128i(a))));
6972}
6973
6974// Sign extend packed 16-bit integers in a to packed 64-bit integers, and store
6975// the results in dst.
6976// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepi16_epi64
6977FORCE_INLINE __m128i _mm_cvtepi16_epi64(__m128i a)
6978{
6979 int16x8_t s16x8 = vreinterpretq_s16_m128i(a); /* xxxx xxxx xxxx 0B0A */
6980 int32x4_t s32x4 = vmovl_s16(vget_low_s16(s16x8)); /* 000x 000x 000B 000A */
6981 int64x2_t s64x2 = vmovl_s32(vget_low_s32(s32x4)); /* 0000 000B 0000 000A */
6982 return vreinterpretq_m128i_s64(s64x2);
6983}
6984
6985// Sign extend packed 32-bit integers in a to packed 64-bit integers, and store
6986// the results in dst.
6987// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepi32_epi64
6988FORCE_INLINE __m128i _mm_cvtepi32_epi64(__m128i a)
6989{
6990 return vreinterpretq_m128i_s64(
6991 vmovl_s32(vget_low_s32(vreinterpretq_s32_m128i(a))));
6992}
6993
6994// Sign extend packed 8-bit integers in a to packed 16-bit integers, and store
6995// the results in dst.
6996// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepi8_epi16
6997FORCE_INLINE __m128i _mm_cvtepi8_epi16(__m128i a)
6998{
6999 int8x16_t s8x16 = vreinterpretq_s8_m128i(a); /* xxxx xxxx xxxx DCBA */
7000 int16x8_t s16x8 = vmovl_s8(vget_low_s8(s8x16)); /* 0x0x 0x0x 0D0C 0B0A */
7001 return vreinterpretq_m128i_s16(s16x8);
7002}
7003
7004// Sign extend packed 8-bit integers in a to packed 32-bit integers, and store
7005// the results in dst.
7006// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepi8_epi32
7007FORCE_INLINE __m128i _mm_cvtepi8_epi32(__m128i a)
7008{
7009 int8x16_t s8x16 = vreinterpretq_s8_m128i(a); /* xxxx xxxx xxxx DCBA */
7010 int16x8_t s16x8 = vmovl_s8(vget_low_s8(s8x16)); /* 0x0x 0x0x 0D0C 0B0A */
7011 int32x4_t s32x4 = vmovl_s16(vget_low_s16(s16x8)); /* 000D 000C 000B 000A */
7012 return vreinterpretq_m128i_s32(s32x4);
7013}
7014
7015// Sign extend packed 8-bit integers in the low 8 bytes of a to packed 64-bit
7016// integers, and store the results in dst.
7017// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepi8_epi64
7018FORCE_INLINE __m128i _mm_cvtepi8_epi64(__m128i a)
7019{
7020 int8x16_t s8x16 = vreinterpretq_s8_m128i(a); /* xxxx xxxx xxxx xxBA */
7021 int16x8_t s16x8 = vmovl_s8(vget_low_s8(s8x16)); /* 0x0x 0x0x 0x0x 0B0A */
7022 int32x4_t s32x4 = vmovl_s16(vget_low_s16(s16x8)); /* 000x 000x 000B 000A */
7023 int64x2_t s64x2 = vmovl_s32(vget_low_s32(s32x4)); /* 0000 000B 0000 000A */
7024 return vreinterpretq_m128i_s64(s64x2);
7025}
7026
7027// Zero extend packed unsigned 16-bit integers in a to packed 32-bit integers,
7028// and store the results in dst.
7029// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepu16_epi32
7030FORCE_INLINE __m128i _mm_cvtepu16_epi32(__m128i a)
7031{
7032 return vreinterpretq_m128i_u32(
7033 vmovl_u16(vget_low_u16(vreinterpretq_u16_m128i(a))));
7034}
7035
7036// Zero extend packed unsigned 16-bit integers in a to packed 64-bit integers,
7037// and store the results in dst.
7038// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepu16_epi64
7039FORCE_INLINE __m128i _mm_cvtepu16_epi64(__m128i a)
7040{
7041 uint16x8_t u16x8 = vreinterpretq_u16_m128i(a); /* xxxx xxxx xxxx 0B0A */
7042 uint32x4_t u32x4 = vmovl_u16(vget_low_u16(u16x8)); /* 000x 000x 000B 000A */
7043 uint64x2_t u64x2 = vmovl_u32(vget_low_u32(u32x4)); /* 0000 000B 0000 000A */
7044 return vreinterpretq_m128i_u64(u64x2);
7045}
7046
7047// Zero extend packed unsigned 32-bit integers in a to packed 64-bit integers,
7048// and store the results in dst.
7049// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepu32_epi64
7050FORCE_INLINE __m128i _mm_cvtepu32_epi64(__m128i a)
7051{
7052 return vreinterpretq_m128i_u64(
7053 vmovl_u32(vget_low_u32(vreinterpretq_u32_m128i(a))));
7054}
7055
7056// Zero extend packed unsigned 8-bit integers in a to packed 16-bit integers,
7057// and store the results in dst.
7058// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepu8_epi16
7059FORCE_INLINE __m128i _mm_cvtepu8_epi16(__m128i a)
7060{
7061 uint8x16_t u8x16 = vreinterpretq_u8_m128i(a); /* xxxx xxxx HGFE DCBA */
7062 uint16x8_t u16x8 = vmovl_u8(vget_low_u8(u8x16)); /* 0H0G 0F0E 0D0C 0B0A */
7063 return vreinterpretq_m128i_u16(u16x8);
7064}
7065
7066// Zero extend packed unsigned 8-bit integers in a to packed 32-bit integers,
7067// and store the results in dst.
7068// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepu8_epi32
7069FORCE_INLINE __m128i _mm_cvtepu8_epi32(__m128i a)
7070{
7071 uint8x16_t u8x16 = vreinterpretq_u8_m128i(a); /* xxxx xxxx xxxx DCBA */
7072 uint16x8_t u16x8 = vmovl_u8(vget_low_u8(u8x16)); /* 0x0x 0x0x 0D0C 0B0A */
7073 uint32x4_t u32x4 = vmovl_u16(vget_low_u16(u16x8)); /* 000D 000C 000B 000A */
7074 return vreinterpretq_m128i_u32(u32x4);
7075}
7076
7077// Zero extend packed unsigned 8-bit integers in the low 8 bytes of a to packed
7078// 64-bit integers, and store the results in dst.
7079// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepu8_epi64
7080FORCE_INLINE __m128i _mm_cvtepu8_epi64(__m128i a)
7081{
7082 uint8x16_t u8x16 = vreinterpretq_u8_m128i(a); /* xxxx xxxx xxxx xxBA */
7083 uint16x8_t u16x8 = vmovl_u8(vget_low_u8(u8x16)); /* 0x0x 0x0x 0x0x 0B0A */
7084 uint32x4_t u32x4 = vmovl_u16(vget_low_u16(u16x8)); /* 000x 000x 000B 000A */
7085 uint64x2_t u64x2 = vmovl_u32(vget_low_u32(u32x4)); /* 0000 000B 0000 000A */
7086 return vreinterpretq_m128i_u64(u64x2);
7087}
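
/* Editor's note (illustrative, not part of sse2neon.h): the _mm_cvtepi*_ and
 * _mm_cvtepu*_ helpers above are one or more chained NEON widenings
 * (vmovl_s8/vmovl_u8 and friends) applied to the low lanes of the source.
 * Widening all 16 unsigned bytes to 16-bit lanes takes two calls: */
static inline void sse2neon_widen_u8_demo(__m128i bytes,
                                          __m128i *lo16,
                                          __m128i *hi16)
{
    *lo16 = _mm_cvtepu8_epi16(bytes);                    /* bytes 0..7 */
    *hi16 = _mm_cvtepu8_epi16(_mm_srli_si128(bytes, 8)); /* bytes 8..15 */
}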
7088
7089// Conditionally multiply the packed double-precision (64-bit) floating-point
7090// elements in a and b using the high 4 bits in imm8, sum the two products, and
7091// conditionally store the sum in dst using the low 4 bits of imm8.
7092// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_dp_pd
7093FORCE_INLINE __m128d _mm_dp_pd(__m128d a, __m128d b, const int imm)
7094{
7095 // Generate mask value from constant immediate bit value
7096 const int64_t bit0Mask = imm & 0x01 ? UINT64_MAX : 0;
7097 const int64_t bit1Mask = imm & 0x02 ? UINT64_MAX : 0;
7098#if !SSE2NEON_PRECISE_DP
7099 const int64_t bit4Mask = imm & 0x10 ? UINT64_MAX : 0;
7100 const int64_t bit5Mask = imm & 0x20 ? UINT64_MAX : 0;
7101#endif
7102 // Conditional multiplication
7103#if !SSE2NEON_PRECISE_DP
7104 __m128d mul = _mm_mul_pd(a, b);
7105 const __m128d mulMask =
7106 _mm_castsi128_pd(_mm_set_epi64x(bit5Mask, bit4Mask));
7107 __m128d tmp = _mm_and_pd(mul, mulMask);
7108#else
7109#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
7110 double d0 = (imm & 0x10) ? vgetq_lane_f64(vreinterpretq_f64_m128d(a), 0) *
7111 vgetq_lane_f64(vreinterpretq_f64_m128d(b), 0)
7112 : 0;
7113 double d1 = (imm & 0x20) ? vgetq_lane_f64(vreinterpretq_f64_m128d(a), 1) *
7114 vgetq_lane_f64(vreinterpretq_f64_m128d(b), 1)
7115 : 0;
7116#else
7117 double a0 =
7118 sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0));
7119 double a1 =
7120 sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1));
7121 double b0 =
7122 sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0));
7123 double b1 =
7124 sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 1));
7125 double d0 = (imm & 0x10) ? a0 * b0 : 0;
7126 double d1 = (imm & 0x20) ? a1 * b1 : 0;
7127#endif
7128 __m128d tmp = _mm_set_pd(d1, d0);
7129#endif
7130 // Sum the products
7131#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
7132 double sum = vpaddd_f64(vreinterpretq_f64_m128d(tmp));
7133#else
7134 double _tmp0 = sse2neon_recast_u64_f64(
7135 vgetq_lane_u64(vreinterpretq_u64_m128d(tmp), 0));
7136 double _tmp1 = sse2neon_recast_u64_f64(
7137 vgetq_lane_u64(vreinterpretq_u64_m128d(tmp), 1));
7138 double sum = _tmp0 + _tmp1;
7139#endif
7140 // Conditionally store the sum
7141 const __m128d sumMask =
7142 _mm_castsi128_pd(_mm_set_epi64x(bit1Mask, bit0Mask));
7143 __m128d res = _mm_and_pd(_mm_set_pd1(sum), sumMask);
7144 return res;
7145}
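/* Editor's note: illustrative usage sketch, not part of the original header;
 * example_dot2 is a hypothetical helper. With imm8 = 0x31, bits 4 and 5 keep
 * both products and bit 0 stores the sum only in the low lane, so the result
 * vector is {a0*b0 + a1*b1, 0.0}. */
static inline double example_dot2(__m128d a, __m128d b)
{
    return _mm_cvtsd_f64(_mm_dp_pd(a, b, 0x31));
}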
7146
7147// Conditionally multiply the packed single-precision (32-bit) floating-point
7148// elements in a and b using the high 4 bits in imm8, sum the four products,
7149// and conditionally store the sum in dst using the low 4 bits of imm.
7150 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_dp_ps
7151FORCE_INLINE __m128 _mm_dp_ps(__m128 a, __m128 b, const int imm)
7152{
7153 float32x4_t elementwise_prod = _mm_mul_ps(a, b);
7154
7155#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
7156 /* shortcuts */
7157 if (imm == 0xFF) {
7158 return _mm_set1_ps(vaddvq_f32(elementwise_prod));
7159 }
7160
7161 if ((imm & 0x0F) == 0x0F) {
7162 if (!(imm & (1 << 4)))
7163 elementwise_prod = vsetq_lane_f32(0.0f, elementwise_prod, 0);
7164 if (!(imm & (1 << 5)))
7165 elementwise_prod = vsetq_lane_f32(0.0f, elementwise_prod, 1);
7166 if (!(imm & (1 << 6)))
7167 elementwise_prod = vsetq_lane_f32(0.0f, elementwise_prod, 2);
7168 if (!(imm & (1 << 7)))
7169 elementwise_prod = vsetq_lane_f32(0.0f, elementwise_prod, 3);
7170
7171 return _mm_set1_ps(vaddvq_f32(elementwise_prod));
7172 }
7173#endif
7174
7175 float s = 0.0f;
7176
7177 if (imm & (1 << 4))
7178 s += vgetq_lane_f32(elementwise_prod, 0);
7179 if (imm & (1 << 5))
7180 s += vgetq_lane_f32(elementwise_prod, 1);
7181 if (imm & (1 << 6))
7182 s += vgetq_lane_f32(elementwise_prod, 2);
7183 if (imm & (1 << 7))
7184 s += vgetq_lane_f32(elementwise_prod, 3);
7185
7186 const float32_t res[4] = {
7187 (imm & 0x1) ? s : 0.0f,
7188 (imm & 0x2) ? s : 0.0f,
7189 (imm & 0x4) ? s : 0.0f,
7190 (imm & 0x8) ? s : 0.0f,
7191 };
7192 return vreinterpretq_m128_f32(vld1q_f32(res));
7193}
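/* Editor's note: illustrative usage sketch, not part of the original header;
 * example_dot4 is a hypothetical helper. imm8 = 0xF1 multiplies all four
 * lanes (bits 4..7) and writes the sum to lane 0 only (bit 0). */
static inline float example_dot4(__m128 a, __m128 b)
{
    return _mm_cvtss_f32(_mm_dp_ps(a, b, 0xF1));
}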
7194
7195// Extract a 32-bit integer from a, selected with imm8, and store the result in
7196// dst.
7197 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_extract_epi32
7198// FORCE_INLINE int _mm_extract_epi32(__m128i a, __constrange(0,4) int imm)
7199#define _mm_extract_epi32(a, imm) \
7200 vgetq_lane_s32(vreinterpretq_s32_m128i(a), (imm))
7201
7202// Extract a 64-bit integer from a, selected with imm8, and store the result in
7203// dst.
7204 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_extract_epi64
7205// FORCE_INLINE __int64 _mm_extract_epi64(__m128i a, __constrange(0,2) int imm)
7206#define _mm_extract_epi64(a, imm) \
7207 vgetq_lane_s64(vreinterpretq_s64_m128i(a), (imm))
7208
7209// Extract an 8-bit integer from a, selected with imm8, and store the result in
7210// the lower element of dst. FORCE_INLINE int _mm_extract_epi8(__m128i a,
7211// __constrange(0,16) int imm)
7212 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_extract_epi8
7213#define _mm_extract_epi8(a, imm) vgetq_lane_u8(vreinterpretq_u8_m128i(a), (imm))
7214
7215 // Extract the selected single-precision (32-bit) floating-point element from a as its integer bit pattern.
7216// FORCE_INLINE int _mm_extract_ps(__m128 a, __constrange(0,4) int imm)
7217#define _mm_extract_ps(a, imm) vgetq_lane_s32(vreinterpretq_s32_m128(a), (imm))
7218
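/* Editor's note: illustrative usage sketch, not part of the original header;
 * example_extract_lane2 is a hypothetical helper. _mm_extract_ps returns the
 * raw IEEE-754 bit pattern as an int, so recovering the float value needs a
 * reinterpretation rather than a conversion. */
static inline float example_extract_lane2(__m128 v)
{
    union { int32_t i; float f; } u;
    u.i = _mm_extract_ps(v, 2); /* bit pattern of lane 2 */
    return u.f;
}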
7219// Round the packed double-precision (64-bit) floating-point elements in a down
7220// to an integer value, and store the results as packed double-precision
7221// floating-point elements in dst.
7222 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_floor_pd
7223FORCE_INLINE __m128d _mm_floor_pd(__m128d a)
7224{
7225#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
7226 return vreinterpretq_m128d_f64(vrndmq_f64(vreinterpretq_f64_m128d(a)));
7227#else
7228 double a0, a1;
7229 a0 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0));
7230 a1 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1));
7231 return _mm_set_pd(floor(a1), floor(a0));
7232#endif
7233}
7234
7235// Round the packed single-precision (32-bit) floating-point elements in a down
7236// to an integer value, and store the results as packed single-precision
7237// floating-point elements in dst.
7238 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_floor_ps
7239FORCE_INLINE __m128 _mm_floor_ps(__m128 a)
7240{
7241#if (defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)) || \
7242 defined(__ARM_FEATURE_DIRECTED_ROUNDING)
7243 return vreinterpretq_m128_f32(vrndmq_f32(vreinterpretq_f32_m128(a)));
7244#else
7245 float *f = (float *) &a;
7246 return _mm_set_ps(floorf(f[3]), floorf(f[2]), floorf(f[1]), floorf(f[0]));
7247#endif
7248}
7249
7250// Round the lower double-precision (64-bit) floating-point element in b down to
7251// an integer value, store the result as a double-precision floating-point
7252// element in the lower element of dst, and copy the upper element from a to the
7253// upper element of dst.
7254 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_floor_sd
7255FORCE_INLINE __m128d _mm_floor_sd(__m128d a, __m128d b)
7256{
7257 return _mm_move_sd(a, _mm_floor_pd(b));
7258}
7259
7260// Round the lower single-precision (32-bit) floating-point element in b down to
7261// an integer value, store the result as a single-precision floating-point
7262// element in the lower element of dst, and copy the upper 3 packed elements
7263// from a to the upper elements of dst.
7264 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_floor_ss
7265FORCE_INLINE __m128 _mm_floor_ss(__m128 a, __m128 b)
7266{
7267 return _mm_move_ss(a, _mm_floor_ps(b));
7268}
7269
7270// Copy a to dst, and insert the 32-bit integer i into dst at the location
7271// specified by imm8.
7272 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_insert_epi32
7273// FORCE_INLINE __m128i _mm_insert_epi32(__m128i a, int b,
7274// __constrange(0,4) int imm)
7275#define _mm_insert_epi32(a, b, imm) \
7276 vreinterpretq_m128i_s32( \
7277 vsetq_lane_s32((b), vreinterpretq_s32_m128i(a), (imm)))
7278
7279// Copy a to dst, and insert the 64-bit integer i into dst at the location
7280// specified by imm8.
7281 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_insert_epi64
7282// FORCE_INLINE __m128i _mm_insert_epi64(__m128i a, __int64 b,
7283// __constrange(0,2) int imm)
7284#define _mm_insert_epi64(a, b, imm) \
7285 vreinterpretq_m128i_s64( \
7286 vsetq_lane_s64((b), vreinterpretq_s64_m128i(a), (imm)))
7287
7288// Copy a to dst, and insert the lower 8-bit integer from i into dst at the
7289// location specified by imm8.
7290 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_insert_epi8
7291// FORCE_INLINE __m128i _mm_insert_epi8(__m128i a, int b,
7292// __constrange(0,16) int imm)
7293#define _mm_insert_epi8(a, b, imm) \
7294 vreinterpretq_m128i_s8(vsetq_lane_s8((b), vreinterpretq_s8_m128i(a), (imm)))
7295
7296// Copy a to tmp, then insert a single-precision (32-bit) floating-point
7297// element from b into tmp using the control in imm8. Store tmp to dst using
7298// the mask in imm8 (elements are zeroed out when the corresponding bit is set).
7299 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=insert_ps
7300#define _mm_insert_ps(a, b, imm8) \
7301 _sse2neon_define2( \
7302 __m128, a, b, \
7303 float32x4_t tmp1 = \
7304 vsetq_lane_f32(vgetq_lane_f32(_b, ((imm8) >> 6) & 0x3), \
7305 vreinterpretq_f32_m128(_a), 0); \
7306 float32x4_t tmp2 = \
7307 vsetq_lane_f32(vgetq_lane_f32(tmp1, 0), \
7308 vreinterpretq_f32_m128(_a), (((imm8) >> 4) & 0x3)); \
7309 const uint32_t data[4] = \
7310 _sse2neon_init(((imm8) & (1 << 0)) ? UINT32_MAX : 0, \
7311 ((imm8) & (1 << 1)) ? UINT32_MAX : 0, \
7312 ((imm8) & (1 << 2)) ? UINT32_MAX : 0, \
7313 ((imm8) & (1 << 3)) ? UINT32_MAX : 0); \
7314 uint32x4_t mask = vld1q_u32(data); \
7315 float32x4_t all_zeros = vdupq_n_f32(0); \
7316 \
7317 _sse2neon_return(vreinterpretq_m128_f32( \
7318 vbslq_f32(mask, all_zeros, vreinterpretq_f32_m128(tmp2))));)
7319
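/* Editor's note: illustrative usage sketch, not part of the original header;
 * example_insert_b0_into_a3 is a hypothetical helper. The imm8 layout is:
 * bits [7:6] select the source lane of b, bits [5:4] the destination lane in
 * a, and bits [3:0] zero out result lanes. 0x30 therefore copies b[0] into
 * lane 3 of a and zeroes nothing. */
static inline __m128 example_insert_b0_into_a3(__m128 a, __m128 b)
{
    return _mm_insert_ps(a, b, 0x30);
}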
7320// Compare packed signed 32-bit integers in a and b, and store packed maximum
7321// values in dst.
7322 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_epi32
7323FORCE_INLINE __m128i _mm_max_epi32(__m128i a, __m128i b)
7324{
7325 return vreinterpretq_m128i_s32(
7326 vmaxq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
7327}
7328
7329// Compare packed signed 8-bit integers in a and b, and store packed maximum
7330// values in dst.
7331 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_epi8
7332FORCE_INLINE __m128i _mm_max_epi8(__m128i a, __m128i b)
7333{
7334 return vreinterpretq_m128i_s8(
7335 vmaxq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
7336}
7337
7338// Compare packed unsigned 16-bit integers in a and b, and store packed maximum
7339// values in dst.
7340 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_epu16
7341FORCE_INLINE __m128i _mm_max_epu16(__m128i a, __m128i b)
7342{
7343 return vreinterpretq_m128i_u16(
7344 vmaxq_u16(vreinterpretq_u16_m128i(a), vreinterpretq_u16_m128i(b)));
7345}
7346
7347// Compare packed unsigned 32-bit integers in a and b, and store packed maximum
7348// values in dst.
7349 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_epu32
7350FORCE_INLINE __m128i _mm_max_epu32(__m128i a, __m128i b)
7351{
7352 return vreinterpretq_m128i_u32(
7353 vmaxq_u32(vreinterpretq_u32_m128i(a), vreinterpretq_u32_m128i(b)));
7354}
7355
7356// Compare packed signed 32-bit integers in a and b, and store packed minimum
7357// values in dst.
7358 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_epi32
7359FORCE_INLINE __m128i _mm_min_epi32(__m128i a, __m128i b)
7360{
7361 return vreinterpretq_m128i_s32(
7362 vminq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
7363}
7364
7365// Compare packed signed 8-bit integers in a and b, and store packed minimum
7366// values in dst.
7367 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_epi8
7368FORCE_INLINE __m128i _mm_min_epi8(__m128i a, __m128i b)
7369{
7370 return vreinterpretq_m128i_s8(
7371 vminq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
7372}
7373
7374// Compare packed unsigned 16-bit integers in a and b, and store packed minimum
7375// values in dst.
7376 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_epu16
7377FORCE_INLINE __m128i _mm_min_epu16(__m128i a, __m128i b)
7378{
7379 return vreinterpretq_m128i_u16(
7380 vminq_u16(vreinterpretq_u16_m128i(a), vreinterpretq_u16_m128i(b)));
7381}
7382
7383// Compare packed unsigned 32-bit integers in a and b, and store packed minimum
7384// values in dst.
7385 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_epu32
7386FORCE_INLINE __m128i _mm_min_epu32(__m128i a, __m128i b)
7387{
7388 return vreinterpretq_m128i_u32(
7389 vminq_u32(vreinterpretq_u32_m128i(a), vreinterpretq_u32_m128i(b)));
7390}
7391
7392// Horizontally compute the minimum amongst the packed unsigned 16-bit integers
7393// in a, store the minimum and index in dst, and zero the remaining bits in dst.
7394 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_minpos_epu16
7395FORCE_INLINE __m128i _mm_minpos_epu16(__m128i a)
7396{
7397 __m128i dst;
7398 uint16_t min, idx = 0;
7399#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
7400 // Find the minimum value
7401 min = vminvq_u16(vreinterpretq_u16_m128i(a));
7402
7403 // Get the index of the minimum value
7404 static const uint16_t idxv[] = {0, 1, 2, 3, 4, 5, 6, 7};
7405 uint16x8_t minv = vdupq_n_u16(min);
7406 uint16x8_t cmeq = vceqq_u16(minv, vreinterpretq_u16_m128i(a));
7407 idx = vminvq_u16(vornq_u16(vld1q_u16(idxv), cmeq));
7408#else
7409 // Find the minimum value
7410 __m64 tmp;
7411 tmp = vreinterpret_m64_u16(
7412 vmin_u16(vget_low_u16(vreinterpretq_u16_m128i(a)),
7413 vget_high_u16(vreinterpretq_u16_m128i(a))));
7414 tmp = vreinterpret_m64_u16(
7415 vpmin_u16(vreinterpret_u16_m64(tmp), vreinterpret_u16_m64(tmp)));
7416 tmp = vreinterpret_m64_u16(
7417 vpmin_u16(vreinterpret_u16_m64(tmp), vreinterpret_u16_m64(tmp)));
7418 min = vget_lane_u16(vreinterpret_u16_m64(tmp), 0);
7419 // Get the index of the minimum value
7420 int i;
7421 for (i = 0; i < 8; i++) {
7422 if (min == vgetq_lane_u16(vreinterpretq_u16_m128i(a), 0)) {
7423 idx = (uint16_t) i;
7424 break;
7425 }
7426 a = _mm_srli_si128(a, 2);
7427 }
7428#endif
7429 // Generate result
7430 dst = _mm_setzero_si128();
7431 dst = vreinterpretq_m128i_u16(
7432 vsetq_lane_u16(min, vreinterpretq_u16_m128i(dst), 0));
7433 dst = vreinterpretq_m128i_u16(
7434 vsetq_lane_u16(idx, vreinterpretq_u16_m128i(dst), 1));
7435 return dst;
7436}
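/* Editor's note: illustrative usage sketch, not part of the original header;
 * example_minpos is a hypothetical helper. Lane 0 of the result holds the
 * minimum and lane 1 its index; here the minimum 3 sits at index 5. */
static inline void example_minpos(uint16_t *min, uint16_t *idx)
{
    __m128i v = _mm_set_epi16(9, 8, 3, 7, 6, 5, 4, 10);
    __m128i r = _mm_minpos_epu16(v);
    *min = (uint16_t) _mm_extract_epi16(r, 0); /* 3 */
    *idx = (uint16_t) _mm_extract_epi16(r, 1); /* 5 */
}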
7437
7438// Compute the sum of absolute differences (SADs) of quadruplets of unsigned
7439// 8-bit integers in a compared to those in b, and store the 16-bit results in
7440// dst. Eight SADs are performed using one quadruplet from b and eight
7441 // quadruplets from a. One quadruplet is selected from b starting at the
7442// offset specified in imm8. Eight quadruplets are formed from sequential 8-bit
7443// integers selected from a starting at the offset specified in imm8.
7444 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mpsadbw_epu8
7445FORCE_INLINE __m128i _mm_mpsadbw_epu8(__m128i a, __m128i b, const int imm)
7446{
7447 uint8x16_t _a, _b;
7448
7449 switch (imm & 0x4) {
7450 case 0:
7451 // do nothing
7452 _a = vreinterpretq_u8_m128i(a);
7453 break;
7454 case 4:
7455 _a = vreinterpretq_u8_u32(vextq_u32(vreinterpretq_u32_m128i(a),
7456 vreinterpretq_u32_m128i(a), 1));
7457 break;
7458 default:
7459#if defined(__GNUC__) || defined(__clang__)
7460 __builtin_unreachable();
7461#elif defined(_MSC_VER)
7462 __assume(0);
7463#endif
7464 break;
7465 }
7466
7467 switch (imm & 0x3) {
7468 case 0:
7469 _b = vreinterpretq_u8_u32(
7470 vdupq_n_u32(vgetq_lane_u32(vreinterpretq_u32_m128i(b), 0)));
7471 break;
7472 case 1:
7473 _b = vreinterpretq_u8_u32(
7474 vdupq_n_u32(vgetq_lane_u32(vreinterpretq_u32_m128i(b), 1)));
7475 break;
7476 case 2:
7477 _b = vreinterpretq_u8_u32(
7478 vdupq_n_u32(vgetq_lane_u32(vreinterpretq_u32_m128i(b), 2)));
7479 break;
7480 case 3:
7481 _b = vreinterpretq_u8_u32(
7482 vdupq_n_u32(vgetq_lane_u32(vreinterpretq_u32_m128i(b), 3)));
7483 break;
7484 default:
7485#if defined(__GNUC__) || defined(__clang__)
7486 __builtin_unreachable();
7487#elif defined(_MSC_VER)
7488 __assume(0);
7489#endif
7490 break;
7491 }
7492
7493 int16x8_t c04, c15, c26, c37;
7494 uint8x8_t low_b = vget_low_u8(_b);
7495 c04 = vreinterpretq_s16_u16(vabdl_u8(vget_low_u8(_a), low_b));
7496 uint8x16_t _a_1 = vextq_u8(_a, _a, 1);
7497 c15 = vreinterpretq_s16_u16(vabdl_u8(vget_low_u8(_a_1), low_b));
7498 uint8x16_t _a_2 = vextq_u8(_a, _a, 2);
7499 c26 = vreinterpretq_s16_u16(vabdl_u8(vget_low_u8(_a_2), low_b));
7500 uint8x16_t _a_3 = vextq_u8(_a, _a, 3);
7501 c37 = vreinterpretq_s16_u16(vabdl_u8(vget_low_u8(_a_3), low_b));
7502#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
7503 // |0|4|2|6|
7504 c04 = vpaddq_s16(c04, c26);
7505 // |1|5|3|7|
7506 c15 = vpaddq_s16(c15, c37);
7507
7508 int32x4_t trn1_c =
7509 vtrn1q_s32(vreinterpretq_s32_s16(c04), vreinterpretq_s32_s16(c15));
7510 int32x4_t trn2_c =
7511 vtrn2q_s32(vreinterpretq_s32_s16(c04), vreinterpretq_s32_s16(c15));
7512 return vreinterpretq_m128i_s16(vpaddq_s16(vreinterpretq_s16_s32(trn1_c),
7513 vreinterpretq_s16_s32(trn2_c)));
7514#else
7515 int16x4_t c01, c23, c45, c67;
7516 c01 = vpadd_s16(vget_low_s16(c04), vget_low_s16(c15));
7517 c23 = vpadd_s16(vget_low_s16(c26), vget_low_s16(c37));
7518 c45 = vpadd_s16(vget_high_s16(c04), vget_high_s16(c15));
7519 c67 = vpadd_s16(vget_high_s16(c26), vget_high_s16(c37));
7520
7521 return vreinterpretq_m128i_s16(
7522 vcombine_s16(vpadd_s16(c01, c23), vpadd_s16(c45, c67)));
7523#endif
7524}
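/* Editor's note: illustrative usage sketch, not part of the original header;
 * example_mpsadbw is a hypothetical helper. In imm8, bit 2 selects a's
 * starting offset (0 or 4 bytes) and bits [1:0] select which 4-byte
 * quadruplet of b is used; 0x5 means a starts at byte 4 and b's quadruplet
 * starts at byte 4, yielding eight 16-bit SADs. */
static inline __m128i example_mpsadbw(__m128i a, __m128i b)
{
    return _mm_mpsadbw_epu8(a, b, 0x5);
}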
7525
7526// Multiply the low signed 32-bit integers from each packed 64-bit element in
7527// a and b, and store the signed 64-bit results in dst.
7528 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mul_epi32
7529FORCE_INLINE __m128i _mm_mul_epi32(__m128i a, __m128i b)
7530{
7531 // vmull_s32 upcasts instead of masking, so we downcast.
7532 int32x2_t a_lo = vmovn_s64(vreinterpretq_s64_m128i(a));
7533 int32x2_t b_lo = vmovn_s64(vreinterpretq_s64_m128i(b));
7534 return vreinterpretq_m128i_s64(vmull_s32(a_lo, b_lo));
7535}
7536
7537// Multiply the packed 32-bit integers in a and b, producing intermediate 64-bit
7538// integers, and store the low 32 bits of the intermediate integers in dst.
7539 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mullo_epi32
7540FORCE_INLINE __m128i _mm_mullo_epi32(__m128i a, __m128i b)
7541{
7542 return vreinterpretq_m128i_s32(
7543 vmulq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
7544}
7545
7546// Convert packed signed 32-bit integers from a and b to packed 16-bit integers
7547// using unsigned saturation, and store the results in dst.
7548 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_packus_epi32
7549FORCE_INLINE __m128i _mm_packus_epi32(__m128i a, __m128i b)
7550{
7551 return vreinterpretq_m128i_u16(
7552 vcombine_u16(vqmovun_s32(vreinterpretq_s32_m128i(a)),
7553 vqmovun_s32(vreinterpretq_s32_m128i(b))));
7554}
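/* Editor's note: illustrative usage sketch, not part of the original header;
 * example_packus is a hypothetical helper. Unsigned saturation clamps each
 * signed 32-bit input into [0, 65535]: -7 becomes 0 and 70000 becomes 65535. */
static inline __m128i example_packus(void)
{
    __m128i a = _mm_set_epi32(70000, -7, 65535, 42);
    return _mm_packus_epi32(a, a); /* 16-bit lanes: 42, 65535, 0, 65535, ... */
}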
7555
7556// Round the packed double-precision (64-bit) floating-point elements in a using
7557// the rounding parameter, and store the results as packed double-precision
7558// floating-point elements in dst.
7559 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_round_pd
7560FORCE_INLINE __m128d _mm_round_pd(__m128d a, int rounding)
7561{
7562#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
7563 switch (rounding) {
7564 case (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC):
7565 return vreinterpretq_m128d_f64(vrndnq_f64(vreinterpretq_f64_m128d(a)));
7566 case (_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC):
7567 return _mm_floor_pd(a);
7568 case (_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC):
7569 return _mm_ceil_pd(a);
7570 case (_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC):
7571 return vreinterpretq_m128d_f64(vrndq_f64(vreinterpretq_f64_m128d(a)));
7572 default: //_MM_FROUND_CUR_DIRECTION
7573 return vreinterpretq_m128d_f64(vrndiq_f64(vreinterpretq_f64_m128d(a)));
7574 }
7575#else
7576 double *v_double = (double *) &a;
7577
7578 if (rounding == (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC) ||
7579 (rounding == _MM_FROUND_CUR_DIRECTION &&
7580 _MM_GET_ROUNDING_MODE() == _MM_ROUND_NEAREST)) {
7581 double res[2], tmp;
7582 for (int i = 0; i < 2; i++) {
7583 tmp = (v_double[i] < 0) ? -v_double[i] : v_double[i];
7584 double roundDown = floor(tmp); // Round down value
7585 double roundUp = ceil(tmp); // Round up value
7586 double diffDown = tmp - roundDown;
7587 double diffUp = roundUp - tmp;
7588 if (diffDown < diffUp) {
7589 /* If it's closer to the round down value, then use it */
7590 res[i] = roundDown;
7591 } else if (diffDown > diffUp) {
7592 /* If it's closer to the round up value, then use it */
7593 res[i] = roundUp;
7594 } else {
7595 /* If it's equidistant between round up and round down value,
7596 * pick the one which is an even number */
7597 double half = roundDown / 2;
7598 if (half != floor(half)) {
7599 /* If the round down value is odd, return the round up value
7600 */
7601 res[i] = roundUp;
7602 } else {
7603 /* If the round up value is odd, return the round down value
7604 */
7605 res[i] = roundDown;
7606 }
7607 }
7608 res[i] = (v_double[i] < 0) ? -res[i] : res[i];
7609 }
7610 return _mm_set_pd(res[1], res[0]);
7611 } else if (rounding == (_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC) ||
7612 (rounding == _MM_FROUND_CUR_DIRECTION &&
7613 _MM_GET_ROUNDING_MODE() == _MM_ROUND_DOWN)) {
7614 return _mm_floor_pd(a);
7615 } else if (rounding == (_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC) ||
7616 (rounding == _MM_FROUND_CUR_DIRECTION &&
7617 _MM_GET_ROUNDING_MODE() == _MM_ROUND_UP)) {
7618 return _mm_ceil_pd(a);
7619 }
7620 return _mm_set_pd(v_double[1] > 0 ? floor(v_double[1]) : ceil(v_double[1]),
7621 v_double[0] > 0 ? floor(v_double[0]) : ceil(v_double[0]));
7622#endif
7623}
7624
7625// Round the packed single-precision (32-bit) floating-point elements in a using
7626// the rounding parameter, and store the results as packed single-precision
7627// floating-point elements in dst.
7628 // software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_round_ps
7629FORCE_INLINE __m128 _mm_round_ps(__m128 a, int rounding)
7630{
7631#if (defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)) || \
7632 defined(__ARM_FEATURE_DIRECTED_ROUNDING)
7633 switch (rounding) {
7634 case (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC):
7635 return vreinterpretq_m128_f32(vrndnq_f32(vreinterpretq_f32_m128(a)));
7636 case (_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC):
7637 return _mm_floor_ps(a);
7638 case (_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC):
7639 return _mm_ceil_ps(a);
7640 case (_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC):
7641 return vreinterpretq_m128_f32(vrndq_f32(vreinterpretq_f32_m128(a)));
7642 default: //_MM_FROUND_CUR_DIRECTION
7643 return vreinterpretq_m128_f32(vrndiq_f32(vreinterpretq_f32_m128(a)));
7644 }
7645#else
7646 float *v_float = (float *) &a;
7647
7648 if (rounding == (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC) ||
7649 (rounding == _MM_FROUND_CUR_DIRECTION &&
7650 _MM_GET_ROUNDING_MODE() == _MM_ROUND_NEAREST)) {
7651 uint32x4_t signmask = vdupq_n_u32(0x80000000);
7652 float32x4_t half = vbslq_f32(signmask, vreinterpretq_f32_m128(a),
7653 vdupq_n_f32(0.5f)); /* +/- 0.5 */
7654 int32x4_t r_normal = vcvtq_s32_f32(vaddq_f32(
7655 vreinterpretq_f32_m128(a), half)); /* round to integer: [a + 0.5]*/
7656 int32x4_t r_trunc = vcvtq_s32_f32(
7657 vreinterpretq_f32_m128(a)); /* truncate to integer: [a] */
7658 int32x4_t plusone = vreinterpretq_s32_u32(vshrq_n_u32(
7659 vreinterpretq_u32_s32(vnegq_s32(r_trunc)), 31)); /* 1 or 0 */
7660 int32x4_t r_even = vbicq_s32(vaddq_s32(r_trunc, plusone),
7661 vdupq_n_s32(1)); /* ([a] + {0,1}) & ~1 */
7662 float32x4_t delta = vsubq_f32(
7663 vreinterpretq_f32_m128(a),
7664 vcvtq_f32_s32(r_trunc)); /* compute delta: delta = (a - [a]) */
7665 uint32x4_t is_delta_half =
7666 vceqq_f32(delta, half); /* delta == +/- 0.5 */
7667 return vreinterpretq_m128_f32(
7668 vcvtq_f32_s32(vbslq_s32(is_delta_half, r_even, r_normal)));
7669 } else if (rounding == (_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC) ||
7670 (rounding == _MM_FROUND_CUR_DIRECTION &&
7671 _MM_GET_ROUNDING_MODE() == _MM_ROUND_DOWN)) {
7672 return _mm_floor_ps(a);
7673 } else if (rounding == (_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC) ||
7674 (rounding == _MM_FROUND_CUR_DIRECTION &&
7675 _MM_GET_ROUNDING_MODE() == _MM_ROUND_UP)) {
7676 return _mm_ceil_ps(a);
7677 }
7678 return _mm_set_ps(v_float[3] > 0 ? floorf(v_float[3]) : ceilf(v_float[3]),
7679 v_float[2] > 0 ? floorf(v_float[2]) : ceilf(v_float[2]),
7680 v_float[1] > 0 ? floorf(v_float[1]) : ceilf(v_float[1]),
7681 v_float[0] > 0 ? floorf(v_float[0]) : ceilf(v_float[0]));
7682#endif
7683}
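/* Editor's note: illustrative usage sketch, not part of the original header;
 * example_round_half_even is a hypothetical helper. Rounding to nearest uses
 * the round-half-to-even rule, matching x86 behaviour. */
static inline __m128 example_round_half_even(void)
{
    __m128 v = _mm_set_ps(-2.5f, 2.5f, 1.5f, 0.5f);
    /* result lanes (low to high): 0.0, 2.0, 2.0, -2.0 */
    return _mm_round_ps(v, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
}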
7684
7685// Round the lower double-precision (64-bit) floating-point element in b using
7686// the rounding parameter, store the result as a double-precision floating-point
7687// element in the lower element of dst, and copy the upper element from a to the
7688// upper element of dst.
7689 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_round_sd
7690FORCE_INLINE __m128d _mm_round_sd(__m128d a, __m128d b, int rounding)
7691{
7692 return _mm_move_sd(a, _mm_round_pd(b, rounding));
7693}
7694
7695// Round the lower single-precision (32-bit) floating-point element in b using
7696// the rounding parameter, store the result as a single-precision floating-point
7697// element in the lower element of dst, and copy the upper 3 packed elements
7698// from a to the upper elements of dst. Rounding is done according to the
7699// rounding[3:0] parameter, which can be one of:
7700// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and
7701// suppress exceptions
7702// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and
7703// suppress exceptions
7704// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress
7705// exceptions
7706// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress
7707// exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see
7708// _MM_SET_ROUNDING_MODE
7709 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_round_ss
7710FORCE_INLINE __m128 _mm_round_ss(__m128 a, __m128 b, int rounding)
7711{
7712 return _mm_move_ss(a, _mm_round_ps(b, rounding));
7713}
7714
7715// Load 128-bits of integer data from memory into dst using a non-temporal
7716// memory hint. mem_addr must be aligned on a 16-byte boundary or a
7717// general-protection exception may be generated.
7718 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_stream_load_si128
7719FORCE_INLINE __m128i _mm_stream_load_si128(__m128i *p)
7720{
7721#if __has_builtin(__builtin_nontemporal_store)
7722 return __builtin_nontemporal_load(p);
7723#else
7724 return vreinterpretq_m128i_s64(vld1q_s64((int64_t *) p));
7725#endif
7726}
7727
7728// Compute the bitwise NOT of a and then AND with a 128-bit vector containing
7729// all 1's, and return 1 if the result is zero, otherwise return 0.
7730 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_test_all_ones
7731FORCE_INLINE int _mm_test_all_ones(__m128i a)
7732{
7733 return (uint64_t) (vgetq_lane_s64(a, 0) & vgetq_lane_s64(a, 1)) ==
7734 ~(uint64_t) 0;
7735}
7736
7737// Compute the bitwise AND of 128 bits (representing integer data) in a and
7738// mask, and return 1 if the result is zero, otherwise return 0.
7739 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_test_all_zeros
7740FORCE_INLINE int _mm_test_all_zeros(__m128i a, __m128i mask)
7741{
7742 int64x2_t a_and_mask =
7743 vandq_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(mask));
7744 return !(vgetq_lane_s64(a_and_mask, 0) | vgetq_lane_s64(a_and_mask, 1));
7745}
7746
7747// Compute the bitwise AND of 128 bits (representing integer data) in a and
7748// mask, and set ZF to 1 if the result is zero, otherwise set ZF to 0. Compute
7749// the bitwise NOT of a and then AND with mask, and set CF to 1 if the result is
7750// zero, otherwise set CF to 0. Return 1 if both the ZF and CF values are zero,
7751// otherwise return 0.
7752 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_test_mix_ones_zero
7753// Note: Argument names may be wrong in the Intel intrinsics guide.
7754FORCE_INLINE int _mm_test_mix_ones_zeros(__m128i a, __m128i mask)
7755{
7756 uint64x2_t v = vreinterpretq_u64_m128i(a);
7757 uint64x2_t m = vreinterpretq_u64_m128i(mask);
7758
7759 // find ones (set-bits) and zeros (clear-bits) under clip mask
7760 uint64x2_t ones = vandq_u64(m, v);
7761 uint64x2_t zeros = vbicq_u64(m, v);
7762
7763 // If both 128-bit variables are populated (non-zero) then return 1.
7764 // For comparison purposes, first compact each var down to 32-bits.
7765 uint32x2_t reduced = vpmax_u32(vqmovn_u64(ones), vqmovn_u64(zeros));
7766
7767 // if folding minimum is non-zero then both vars must be non-zero
7768 return (vget_lane_u32(vpmin_u32(reduced, reduced), 0) != 0);
7769}
7770
7771// Compute the bitwise AND of 128 bits (representing integer data) in a and b,
7772// and set ZF to 1 if the result is zero, otherwise set ZF to 0. Compute the
7773// bitwise NOT of a and then AND with b, and set CF to 1 if the result is zero,
7774// otherwise set CF to 0. Return the CF value.
7775 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_testc_si128
7776FORCE_INLINE int _mm_testc_si128(__m128i a, __m128i b)
7777{
7778 int64x2_t s64_vec =
7779 vbicq_s64(vreinterpretq_s64_m128i(b), vreinterpretq_s64_m128i(a));
7780 return !(vgetq_lane_s64(s64_vec, 0) | vgetq_lane_s64(s64_vec, 1));
7781}
7782
7783// Compute the bitwise AND of 128 bits (representing integer data) in a and b,
7784// and set ZF to 1 if the result is zero, otherwise set ZF to 0. Compute the
7785// bitwise NOT of a and then AND with b, and set CF to 1 if the result is zero,
7786// otherwise set CF to 0. Return 1 if both the ZF and CF values are zero,
7787// otherwise return 0.
7788 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_testnzc_si128
7789#define _mm_testnzc_si128(a, b) _mm_test_mix_ones_zeros(a, b)
7790
7791// Compute the bitwise AND of 128 bits (representing integer data) in a and b,
7792// and set ZF to 1 if the result is zero, otherwise set ZF to 0. Compute the
7793// bitwise NOT of a and then AND with b, and set CF to 1 if the result is zero,
7794// otherwise set CF to 0. Return the ZF value.
7795 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_testz_si128
7796FORCE_INLINE int _mm_testz_si128(__m128i a, __m128i b)
7797{
7798 int64x2_t s64_vec =
7799 vandq_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(b));
7800 return !(vgetq_lane_s64(s64_vec, 0) | vgetq_lane_s64(s64_vec, 1));
7801}
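/* Editor's note: illustrative usage sketch, not part of the original header;
 * example_has_all_bits is a hypothetical helper. _mm_testz_si128(a, b) is
 * non-zero exactly when (a & b) == 0 (the ZF result), and _mm_testc_si128(a, b)
 * is non-zero exactly when (~a & b) == 0 (the CF result), so testc answers
 * "is every bit of 'required' also set in 'flags'?". */
static inline int example_has_all_bits(__m128i flags, __m128i required)
{
    return _mm_testc_si128(flags, required);
}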
7802
7803/* SSE4.2 */
7804
7805static const uint16_t ALIGN_STRUCT(16) _sse2neon_cmpestr_mask16b[8] = {
7806 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
7807};
7808static const uint8_t ALIGN_STRUCT(16) _sse2neon_cmpestr_mask8b[16] = {
7809 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
7810 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
7811};
7812
7813/* specify the source data format */
7814#define _SIDD_UBYTE_OPS 0x00 /* unsigned 8-bit characters */
7815#define _SIDD_UWORD_OPS 0x01 /* unsigned 16-bit characters */
7816#define _SIDD_SBYTE_OPS 0x02 /* signed 8-bit characters */
7817#define _SIDD_SWORD_OPS 0x03 /* signed 16-bit characters */
7818
7819/* specify the comparison operation */
7820#define _SIDD_CMP_EQUAL_ANY 0x00 /* compare equal any: strchr */
7821#define _SIDD_CMP_RANGES 0x04 /* compare ranges */
7822#define _SIDD_CMP_EQUAL_EACH 0x08 /* compare equal each: strcmp */
7823#define _SIDD_CMP_EQUAL_ORDERED 0x0C /* compare equal ordered */
7824
7825/* specify the polarity */
7826#define _SIDD_POSITIVE_POLARITY 0x00
7827#define _SIDD_MASKED_POSITIVE_POLARITY 0x20
7828#define _SIDD_NEGATIVE_POLARITY 0x10 /* negate results */
7829#define _SIDD_MASKED_NEGATIVE_POLARITY \
7830 0x30 /* negate results only before end of string */
7831
7832/* specify the output selection in _mm_cmpXstri */
7833#define _SIDD_LEAST_SIGNIFICANT 0x00
7834#define _SIDD_MOST_SIGNIFICANT 0x40
7835
7836/* specify the output selection in _mm_cmpXstrm */
7837#define _SIDD_BIT_MASK 0x00
7838#define _SIDD_UNIT_MASK 0x40
7839
7840/* Pattern Matching for C macros.
7841 * https://github.com/pfultz2/Cloak/wiki/C-Preprocessor-tricks,-tips,-and-idioms
7842 */
7843
7844/* catenate */
7845#define SSE2NEON_PRIMITIVE_CAT(a, ...) a##__VA_ARGS__
7846#define SSE2NEON_CAT(a, b) SSE2NEON_PRIMITIVE_CAT(a, b)
7847
7848#define SSE2NEON_IIF(c) SSE2NEON_PRIMITIVE_CAT(SSE2NEON_IIF_, c)
7849/* run the 2nd parameter */
7850#define SSE2NEON_IIF_0(t, ...) __VA_ARGS__
7851/* run the 1st parameter */
7852#define SSE2NEON_IIF_1(t, ...) t
7853
7854#define SSE2NEON_COMPL(b) SSE2NEON_PRIMITIVE_CAT(SSE2NEON_COMPL_, b)
7855#define SSE2NEON_COMPL_0 1
7856#define SSE2NEON_COMPL_1 0
7857
7858#define SSE2NEON_DEC(x) SSE2NEON_PRIMITIVE_CAT(SSE2NEON_DEC_, x)
7859#define SSE2NEON_DEC_1 0
7860#define SSE2NEON_DEC_2 1
7861#define SSE2NEON_DEC_3 2
7862#define SSE2NEON_DEC_4 3
7863#define SSE2NEON_DEC_5 4
7864#define SSE2NEON_DEC_6 5
7865#define SSE2NEON_DEC_7 6
7866#define SSE2NEON_DEC_8 7
7867#define SSE2NEON_DEC_9 8
7868#define SSE2NEON_DEC_10 9
7869#define SSE2NEON_DEC_11 10
7870#define SSE2NEON_DEC_12 11
7871#define SSE2NEON_DEC_13 12
7872#define SSE2NEON_DEC_14 13
7873#define SSE2NEON_DEC_15 14
7874#define SSE2NEON_DEC_16 15
7875
7876/* detection */
7877#define SSE2NEON_CHECK_N(x, n, ...) n
7878#define SSE2NEON_CHECK(...) SSE2NEON_CHECK_N(__VA_ARGS__, 0, )
7879#define SSE2NEON_PROBE(x) x, 1,
7880
7881#define SSE2NEON_NOT(x) SSE2NEON_CHECK(SSE2NEON_PRIMITIVE_CAT(SSE2NEON_NOT_, x))
7882#define SSE2NEON_NOT_0 SSE2NEON_PROBE(~)
7883
7884#define SSE2NEON_BOOL(x) SSE2NEON_COMPL(SSE2NEON_NOT(x))
7885#define SSE2NEON_IF(c) SSE2NEON_IIF(SSE2NEON_BOOL(c))
7886
7887#define SSE2NEON_EAT(...)
7888#define SSE2NEON_EXPAND(...) __VA_ARGS__
7889#define SSE2NEON_WHEN(c) SSE2NEON_IF(c)(SSE2NEON_EXPAND, SSE2NEON_EAT)
7890
7891/* recursion */
7892/* deferred expression */
7893#define SSE2NEON_EMPTY()
7894#define SSE2NEON_DEFER(id) id SSE2NEON_EMPTY()
7895#define SSE2NEON_OBSTRUCT(...) __VA_ARGS__ SSE2NEON_DEFER(SSE2NEON_EMPTY)()
7896#define SSE2NEON_EXPAND(...) __VA_ARGS__
7897
7898#define SSE2NEON_EVAL(...) \
7899 SSE2NEON_EVAL1(SSE2NEON_EVAL1(SSE2NEON_EVAL1(__VA_ARGS__)))
7900#define SSE2NEON_EVAL1(...) \
7901 SSE2NEON_EVAL2(SSE2NEON_EVAL2(SSE2NEON_EVAL2(__VA_ARGS__)))
7902#define SSE2NEON_EVAL2(...) \
7903 SSE2NEON_EVAL3(SSE2NEON_EVAL3(SSE2NEON_EVAL3(__VA_ARGS__)))
7904#define SSE2NEON_EVAL3(...) __VA_ARGS__
7905
7906#define SSE2NEON_REPEAT(count, macro, ...) \
7907 SSE2NEON_WHEN(count) \
7908 (SSE2NEON_OBSTRUCT(SSE2NEON_REPEAT_INDIRECT)()( \
7909 SSE2NEON_DEC(count), macro, \
7910 __VA_ARGS__) SSE2NEON_OBSTRUCT(macro)(SSE2NEON_DEC(count), \
7911 __VA_ARGS__))
7912#define SSE2NEON_REPEAT_INDIRECT() SSE2NEON_REPEAT
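/* Editor's note, not part of the original header: as an illustration of the
 * machinery above, SSE2NEON_EVAL(SSE2NEON_REPEAT(3, M, x)) expands to
 *     M(0, x) M(1, x) M(2, x)
 * i.e. one instantiation of M per lane index, which is how the PCMPSTR
 * helpers below unroll their per-lane comparisons. */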
7913
7914#define SSE2NEON_SIZE_OF_byte 8
7915#define SSE2NEON_NUMBER_OF_LANES_byte 16
7916#define SSE2NEON_SIZE_OF_word 16
7917#define SSE2NEON_NUMBER_OF_LANES_word 8
7918
7919#define SSE2NEON_COMPARE_EQUAL_THEN_FILL_LANE(i, type) \
7920 mtx[i] = vreinterpretq_m128i_##type(vceqq_##type( \
7921 vdupq_n_##type(vgetq_lane_##type(vreinterpretq_##type##_m128i(b), i)), \
7922 vreinterpretq_##type##_m128i(a)));
7923
7924#define SSE2NEON_FILL_LANE(i, type) \
7925 vec_b[i] = \
7926 vdupq_n_##type(vgetq_lane_##type(vreinterpretq_##type##_m128i(b), i));
7927
7928#define PCMPSTR_RANGES(a, b, mtx, data_type_prefix, type_prefix, size, \
7929 number_of_lanes, byte_or_word) \
7930 do { \
7931 SSE2NEON_CAT( \
7932 data_type_prefix, \
7933 SSE2NEON_CAT(size, \
7934 SSE2NEON_CAT(x, SSE2NEON_CAT(number_of_lanes, _t)))) \
7935 vec_b[number_of_lanes]; \
7936 __m128i mask = SSE2NEON_IIF(byte_or_word)( \
7937 vreinterpretq_m128i_u16(vdupq_n_u16(0xff)), \
7938 vreinterpretq_m128i_u32(vdupq_n_u32(0xffff))); \
7939 SSE2NEON_EVAL(SSE2NEON_REPEAT(number_of_lanes, SSE2NEON_FILL_LANE, \
7940 SSE2NEON_CAT(type_prefix, size))) \
7941 for (int i = 0; i < number_of_lanes; i++) { \
7942 mtx[i] = SSE2NEON_CAT(vreinterpretq_m128i_u, \
7943 size)(SSE2NEON_CAT(vbslq_u, size)( \
7944 SSE2NEON_CAT(vreinterpretq_u, \
7945 SSE2NEON_CAT(size, _m128i))(mask), \
7946 SSE2NEON_CAT(vcgeq_, SSE2NEON_CAT(type_prefix, size))( \
7947 vec_b[i], \
7948 SSE2NEON_CAT( \
7949 vreinterpretq_, \
7950 SSE2NEON_CAT(type_prefix, \
7951 SSE2NEON_CAT(size, _m128i(a))))), \
7952 SSE2NEON_CAT(vcleq_, SSE2NEON_CAT(type_prefix, size))( \
7953 vec_b[i], \
7954 SSE2NEON_CAT( \
7955 vreinterpretq_, \
7956 SSE2NEON_CAT(type_prefix, \
7957 SSE2NEON_CAT(size, _m128i(a))))))); \
7958 } \
7959 } while (0)
7960
7961#define PCMPSTR_EQ(a, b, mtx, size, number_of_lanes) \
7962 do { \
7963 SSE2NEON_EVAL(SSE2NEON_REPEAT(number_of_lanes, \
7964 SSE2NEON_COMPARE_EQUAL_THEN_FILL_LANE, \
7965 SSE2NEON_CAT(u, size))) \
7966 } while (0)
7967
7968#define SSE2NEON_CMP_EQUAL_ANY_IMPL(type) \
7969 static uint16_t _sse2neon_cmp_##type##_equal_any(__m128i a, int la, \
7970 __m128i b, int lb) \
7971 { \
7972 __m128i mtx[16]; \
7973 PCMPSTR_EQ(a, b, mtx, SSE2NEON_CAT(SSE2NEON_SIZE_OF_, type), \
7974 SSE2NEON_CAT(SSE2NEON_NUMBER_OF_LANES_, type)); \
7975 return SSE2NEON_CAT( \
7976 _sse2neon_aggregate_equal_any_, \
7977 SSE2NEON_CAT( \
7978 SSE2NEON_CAT(SSE2NEON_SIZE_OF_, type), \
7979 SSE2NEON_CAT(x, SSE2NEON_CAT(SSE2NEON_NUMBER_OF_LANES_, \
7980 type))))(la, lb, mtx); \
7981 }
7982
7983#define SSE2NEON_CMP_RANGES_IMPL(type, data_type, us, byte_or_word) \
7984 static uint16_t _sse2neon_cmp_##us##type##_ranges(__m128i a, int la, \
7985 __m128i b, int lb) \
7986 { \
7987 __m128i mtx[16]; \
7988 PCMPSTR_RANGES( \
7989 a, b, mtx, data_type, us, SSE2NEON_CAT(SSE2NEON_SIZE_OF_, type), \
7990 SSE2NEON_CAT(SSE2NEON_NUMBER_OF_LANES_, type), byte_or_word); \
7991 return SSE2NEON_CAT( \
7992 _sse2neon_aggregate_ranges_, \
7993 SSE2NEON_CAT( \
7994 SSE2NEON_CAT(SSE2NEON_SIZE_OF_, type), \
7995 SSE2NEON_CAT(x, SSE2NEON_CAT(SSE2NEON_NUMBER_OF_LANES_, \
7996 type))))(la, lb, mtx); \
7997 }
7998
7999#define SSE2NEON_CMP_EQUAL_ORDERED_IMPL(type) \
8000 static uint16_t _sse2neon_cmp_##type##_equal_ordered(__m128i a, int la, \
8001 __m128i b, int lb) \
8002 { \
8003 __m128i mtx[16]; \
8004 PCMPSTR_EQ(a, b, mtx, SSE2NEON_CAT(SSE2NEON_SIZE_OF_, type), \
8005 SSE2NEON_CAT(SSE2NEON_NUMBER_OF_LANES_, type)); \
8006 return SSE2NEON_CAT( \
8007 _sse2neon_aggregate_equal_ordered_, \
8008 SSE2NEON_CAT( \
8009 SSE2NEON_CAT(SSE2NEON_SIZE_OF_, type), \
8010 SSE2NEON_CAT(x, \
8011 SSE2NEON_CAT(SSE2NEON_NUMBER_OF_LANES_, type))))( \
8012 SSE2NEON_CAT(SSE2NEON_NUMBER_OF_LANES_, type), la, lb, mtx); \
8013 }
8014
8015static uint16_t _sse2neon_aggregate_equal_any_8x16(int la,
8016 int lb,
8017 __m128i mtx[16])
8018{
8019 uint16_t res = 0;
8020 int m = (1 << la) - 1;
8021 uint8x8_t vec_mask = vld1_u8(_sse2neon_cmpestr_mask8b);
8022 uint8x8_t t_lo = vtst_u8(vdup_n_u8((uint8_t) (m & 0xff)), vec_mask);
8023 uint8x8_t t_hi = vtst_u8(vdup_n_u8((uint8_t) (m >> 8)), vec_mask);
8024 uint8x16_t vec = vcombine_u8(t_lo, t_hi);
8025 for (int j = 0; j < lb; j++) {
8026 mtx[j] = vreinterpretq_m128i_u8(
8027 vandq_u8(vec, vreinterpretq_u8_m128i(mtx[j])));
8028 mtx[j] = vreinterpretq_m128i_u8(
8029 vshrq_n_u8(vreinterpretq_u8_m128i(mtx[j]), 7));
8030 uint16_t tmp =
8031 _sse2neon_vaddvq_u8(vreinterpretq_u8_m128i(mtx[j])) ? 1 : 0;
8032 res |= (tmp << j);
8033 }
8034 return res;
8035}
8036
8037static uint16_t _sse2neon_aggregate_equal_any_16x8(int la,
8038 int lb,
8039 __m128i mtx[16])
8040{
8041 uint16_t res = 0;
8042 uint16_t m = (uint16_t) (1 << la) - 1;
8043 uint16x8_t vec =
8044 vtstq_u16(vdupq_n_u16(m), vld1q_u16(_sse2neon_cmpestr_mask16b));
8045 for (int j = 0; j < lb; j++) {
8046 mtx[j] = vreinterpretq_m128i_u16(
8047 vandq_u16(vec, vreinterpretq_u16_m128i(mtx[j])));
8048 mtx[j] = vreinterpretq_m128i_u16(
8049 vshrq_n_u16(vreinterpretq_u16_m128i(mtx[j]), 15));
8050 uint16_t tmp =
8051 _sse2neon_vaddvq_u16(vreinterpretq_u16_m128i(mtx[j])) ? 1 : 0;
8052 res |= (tmp << j);
8053 }
8054 return res;
8055}
8056
8057/* clang-format off */
8058#define SSE2NEON_GENERATE_CMP_EQUAL_ANY(prefix) \
8059 prefix##IMPL(byte) \
8060 prefix##IMPL(word)
8061/* clang-format on */
8062
8063SSE2NEON_GENERATE_CMP_EQUAL_ANY(SSE2NEON_CMP_EQUAL_ANY_)
8064
8065static uint16_t _sse2neon_aggregate_ranges_16x8(int la, int lb, __m128i mtx[16])
8066{
8067 uint16_t res = 0;
8068 uint16_t m = (uint16_t) (1 << la) - 1;
8069 uint16x8_t vec =
8070 vtstq_u16(vdupq_n_u16(m), vld1q_u16(_sse2neon_cmpestr_mask16b));
8071 for (int j = 0; j < lb; j++) {
8072 mtx[j] = vreinterpretq_m128i_u16(
8073 vandq_u16(vec, vreinterpretq_u16_m128i(mtx[j])));
8074 mtx[j] = vreinterpretq_m128i_u16(
8075 vshrq_n_u16(vreinterpretq_u16_m128i(mtx[j]), 15));
8076 __m128i tmp = vreinterpretq_m128i_u32(
8077 vshrq_n_u32(vreinterpretq_u32_m128i(mtx[j]), 16));
8078 uint32x4_t vec_res = vandq_u32(vreinterpretq_u32_m128i(mtx[j]),
8079 vreinterpretq_u32_m128i(tmp));
8080#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
8081 uint16_t t = vaddvq_u32(vec_res) ? 1 : 0;
8082#else
8083 uint64x2_t sumh = vpaddlq_u32(vec_res);
8084 uint16_t t = vgetq_lane_u64(sumh, 0) + vgetq_lane_u64(sumh, 1);
8085#endif
8086 res |= (t << j);
8087 }
8088 return res;
8089}
8090
8091static uint16_t _sse2neon_aggregate_ranges_8x16(int la, int lb, __m128i mtx[16])
8092{
8093 uint16_t res = 0;
8094 uint16_t m = (uint16_t) ((1 << la) - 1);
8095 uint8x8_t vec_mask = vld1_u8(_sse2neon_cmpestr_mask8b);
8096 uint8x8_t t_lo = vtst_u8(vdup_n_u8((uint8_t) (m & 0xff)), vec_mask);
8097 uint8x8_t t_hi = vtst_u8(vdup_n_u8((uint8_t) (m >> 8)), vec_mask);
8098 uint8x16_t vec = vcombine_u8(t_lo, t_hi);
8099 for (int j = 0; j < lb; j++) {
8100 mtx[j] = vreinterpretq_m128i_u8(
8101 vandq_u8(vec, vreinterpretq_u8_m128i(mtx[j])));
8102 mtx[j] = vreinterpretq_m128i_u8(
8103 vshrq_n_u8(vreinterpretq_u8_m128i(mtx[j]), 7));
8104 __m128i tmp = vreinterpretq_m128i_u16(
8105 vshrq_n_u16(vreinterpretq_u16_m128i(mtx[j]), 8));
8106 uint16x8_t vec_res = vandq_u16(vreinterpretq_u16_m128i(mtx[j]),
8107 vreinterpretq_u16_m128i(tmp));
8108 uint16_t t = _sse2neon_vaddvq_u16(vec_res) ? 1 : 0;
8109 res |= (t << j);
8110 }
8111 return res;
8112}
8113
8114#define SSE2NEON_CMP_RANGES_IS_BYTE 1
8115#define SSE2NEON_CMP_RANGES_IS_WORD 0
8116
8117/* clang-format off */
8118#define SSE2NEON_GENERATE_CMP_RANGES(prefix) \
8119 prefix##IMPL(byte, uint, u, prefix##IS_BYTE) \
8120 prefix##IMPL(byte, int, s, prefix##IS_BYTE) \
8121 prefix##IMPL(word, uint, u, prefix##IS_WORD) \
8122 prefix##IMPL(word, int, s, prefix##IS_WORD)
8123/* clang-format on */
8124
8125SSE2NEON_GENERATE_CMP_RANGES(SSE2NEON_CMP_RANGES_)
8126
8127#undef SSE2NEON_CMP_RANGES_IS_BYTE
8128#undef SSE2NEON_CMP_RANGES_IS_WORD
8129
8130static uint16_t _sse2neon_cmp_byte_equal_each(__m128i a,
8131 int la,
8132 __m128i b,
8133 int lb)
8134{
8135 uint8x16_t mtx =
8136 vceqq_u8(vreinterpretq_u8_m128i(a), vreinterpretq_u8_m128i(b));
8137 uint16_t m0 = (la < lb) ? 0 : (uint16_t) ((1 << la) - (1 << lb));
8138 uint16_t m1 = (uint16_t) (0x10000 - (1 << la));
8139 uint16_t tb = (uint16_t) (0x10000 - (1 << lb));
8140 uint8x8_t vec_mask, vec0_lo, vec0_hi, vec1_lo, vec1_hi;
8141 uint8x8_t tmp_lo, tmp_hi, res_lo, res_hi;
8142 vec_mask = vld1_u8(_sse2neon_cmpestr_mask8b);
8143 vec0_lo = vtst_u8(vdup_n_u8((uint8_t) m0), vec_mask);
8144 vec0_hi = vtst_u8(vdup_n_u8((uint8_t) (m0 >> 8)), vec_mask);
8145 vec1_lo = vtst_u8(vdup_n_u8((uint8_t) m1), vec_mask);
8146 vec1_hi = vtst_u8(vdup_n_u8((uint8_t) (m1 >> 8)), vec_mask);
8147 tmp_lo = vtst_u8(vdup_n_u8((uint8_t) tb), vec_mask);
8148 tmp_hi = vtst_u8(vdup_n_u8((uint8_t) (tb >> 8)), vec_mask);
8149
8150 res_lo = vbsl_u8(vec0_lo, vdup_n_u8(0), vget_low_u8(mtx));
8151 res_hi = vbsl_u8(vec0_hi, vdup_n_u8(0), vget_high_u8(mtx));
8152 res_lo = vbsl_u8(vec1_lo, tmp_lo, res_lo);
8153 res_hi = vbsl_u8(vec1_hi, tmp_hi, res_hi);
8154 res_lo = vand_u8(res_lo, vec_mask);
8155 res_hi = vand_u8(res_hi, vec_mask);
8156
8157 return _sse2neon_vaddv_u8(res_lo) +
8158 (uint16_t) (_sse2neon_vaddv_u8(res_hi) << 8);
8159}
8160
8161static uint16_t _sse2neon_cmp_word_equal_each(__m128i a,
8162 int la,
8163 __m128i b,
8164 int lb)
8165{
8166 uint16x8_t mtx =
8167 vceqq_u16(vreinterpretq_u16_m128i(a), vreinterpretq_u16_m128i(b));
8168 uint16_t m0 = (uint16_t) ((la < lb) ? 0 : ((1 << la) - (1 << lb)));
8169 uint16_t m1 = (uint16_t) (0x100 - (1 << la));
8170 uint16_t tb = (uint16_t) (0x100 - (1 << lb));
8171 uint16x8_t vec_mask = vld1q_u16(_sse2neon_cmpestr_mask16b);
8172 uint16x8_t vec0 = vtstq_u16(vdupq_n_u16(m0), vec_mask);
8173 uint16x8_t vec1 = vtstq_u16(vdupq_n_u16(m1), vec_mask);
8174 uint16x8_t tmp = vtstq_u16(vdupq_n_u16(tb), vec_mask);
8175 mtx = vbslq_u16(vec0, vdupq_n_u16(0), mtx);
8176 mtx = vbslq_u16(vec1, tmp, mtx);
8177 mtx = vandq_u16(mtx, vec_mask);
8178 return _sse2neon_vaddvq_u16(mtx);
8179}
8180
8181#define SSE2NEON_AGGREGATE_EQUAL_ORDER_IS_UBYTE 1
8182#define SSE2NEON_AGGREGATE_EQUAL_ORDER_IS_UWORD 0
8183
8184#define SSE2NEON_AGGREGATE_EQUAL_ORDER_IMPL(size, number_of_lanes, data_type) \
8185 static uint16_t \
8186 _sse2neon_aggregate_equal_ordered_##size##x##number_of_lanes( \
8187 int bound, int la, int lb, __m128i mtx[16]) \
8188 { \
8189 uint16_t res = 0; \
8190 uint16_t m1 = \
8191 (uint16_t) (SSE2NEON_IIF(data_type)(0x10000, 0x100) - (1 << la)); \
8192 uint##size##x8_t vec_mask = SSE2NEON_IIF(data_type)( \
8193 vld1_u##size(_sse2neon_cmpestr_mask##size##b), \
8194 vld1q_u##size(_sse2neon_cmpestr_mask##size##b)); \
8195 uint##size##x##number_of_lanes##_t vec1 = SSE2NEON_IIF(data_type)( \
8196 vcombine_u##size( \
8197 vtst_u##size(vdup_n_u##size((uint##size##_t) m1), vec_mask), \
8198 vtst_u##size(vdup_n_u##size((uint##size##_t)(m1 >> 8)), \
8199 vec_mask)), \
8200 vtstq_u##size(vdupq_n_u##size((uint##size##_t) m1), vec_mask)); \
8201 uint##size##x##number_of_lanes##_t vec_minusone = vdupq_n_u##size(-1); \
8202 uint##size##x##number_of_lanes##_t vec_zero = vdupq_n_u##size(0); \
8203 for (int j = 0; j < lb; j++) { \
8204 mtx[j] = vreinterpretq_m128i_u##size(vbslq_u##size( \
8205 vec1, vec_minusone, vreinterpretq_u##size##_m128i(mtx[j]))); \
8206 } \
8207 for (int j = lb; j < bound; j++) { \
8208 mtx[j] = vreinterpretq_m128i_u##size( \
8209 vbslq_u##size(vec1, vec_minusone, vec_zero)); \
8210 } \
8211 unsigned SSE2NEON_IIF(data_type)(char, short) *ptr = \
8212 (unsigned SSE2NEON_IIF(data_type)(char, short) *) mtx; \
8213 for (int i = 0; i < bound; i++) { \
8214 int val = 1; \
8215 for (int j = 0, k = i; j < bound - i && k < bound; j++, k++) \
8216 val &= ptr[k * bound + j]; \
8217 res += (uint16_t) (val << i); \
8218 } \
8219 return res; \
8220 }
8221
8222/* clang-format off */
8223#define SSE2NEON_GENERATE_AGGREGATE_EQUAL_ORDER(prefix) \
8224 prefix##IMPL(8, 16, prefix##IS_UBYTE) \
8225 prefix##IMPL(16, 8, prefix##IS_UWORD)
8226/* clang-format on */
8227
8228SSE2NEON_GENERATE_AGGREGATE_EQUAL_ORDER(SSE2NEON_AGGREGATE_EQUAL_ORDER_)
8229
8230#undef SSE2NEON_AGGREGATE_EQUAL_ORDER_IS_UBYTE
8231#undef SSE2NEON_AGGREGATE_EQUAL_ORDER_IS_UWORD
8232
8233/* clang-format off */
8234#define SSE2NEON_GENERATE_CMP_EQUAL_ORDERED(prefix) \
8235 prefix##IMPL(byte) \
8236 prefix##IMPL(word)
8237/* clang-format on */
8238
8239SSE2NEON_GENERATE_CMP_EQUAL_ORDERED(SSE2NEON_CMP_EQUAL_ORDERED_)
8240
8241#define SSE2NEON_CMPESTR_LIST \
8242 _(CMP_UBYTE_EQUAL_ANY, cmp_byte_equal_any) \
8243 _(CMP_UWORD_EQUAL_ANY, cmp_word_equal_any) \
8244 _(CMP_SBYTE_EQUAL_ANY, cmp_byte_equal_any) \
8245 _(CMP_SWORD_EQUAL_ANY, cmp_word_equal_any) \
8246 _(CMP_UBYTE_RANGES, cmp_ubyte_ranges) \
8247 _(CMP_UWORD_RANGES, cmp_uword_ranges) \
8248 _(CMP_SBYTE_RANGES, cmp_sbyte_ranges) \
8249 _(CMP_SWORD_RANGES, cmp_sword_ranges) \
8250 _(CMP_UBYTE_EQUAL_EACH, cmp_byte_equal_each) \
8251 _(CMP_UWORD_EQUAL_EACH, cmp_word_equal_each) \
8252 _(CMP_SBYTE_EQUAL_EACH, cmp_byte_equal_each) \
8253 _(CMP_SWORD_EQUAL_EACH, cmp_word_equal_each) \
8254 _(CMP_UBYTE_EQUAL_ORDERED, cmp_byte_equal_ordered) \
8255 _(CMP_UWORD_EQUAL_ORDERED, cmp_word_equal_ordered) \
8256 _(CMP_SBYTE_EQUAL_ORDERED, cmp_byte_equal_ordered) \
8257 _(CMP_SWORD_EQUAL_ORDERED, cmp_word_equal_ordered)
8258
8259enum {
8260#define _(name, func_suffix) name,
8261 SSE2NEON_CMPESTR_LIST
8262#undef _
8263};
8264typedef uint16_t (*cmpestr_func_t)(__m128i a, int la, __m128i b, int lb);
8265static cmpestr_func_t _sse2neon_cmpfunc_table[] = {
8266#define _(name, func_suffix) _sse2neon_##func_suffix,
8267 SSE2NEON_CMPESTR_LIST
8268#undef _
8269};
8270
8271FORCE_INLINE uint16_t _sse2neon_sido_negative(int res,
8272 int lb,
8273 int imm8,
8274 int bound)
8275{
8276 switch (imm8 & 0x30) {
8277 case _SIDD_NEGATIVE_POLARITY:
8278 res ^= 0xffffffff;
8279 break;
8280 case _SIDD_MASKED_NEGATIVE_POLARITY:
8281 res ^= (1 << lb) - 1;
8282 break;
8283 default:
8284 break;
8285 }
8286
8287 return (uint16_t) (res & ((bound == 8) ? 0xFF : 0xFFFF));
8288}
8289
8290FORCE_INLINE int _sse2neon_clz(unsigned int x)
8291{
8292#if defined(_MSC_VER) && !defined(__clang__)
8293 unsigned long cnt = 0;
8294 if (_BitScanReverse(&cnt, x))
8295 return 31 - cnt;
8296 return 32;
8297#else
8298 return x != 0 ? __builtin_clz(x) : 32;
8299#endif
8300}
8301
8302FORCE_INLINE int _sse2neon_ctz(unsigned int x)
8303{
8304#if defined(_MSC_VER) && !defined(__clang__)
8305 unsigned long cnt = 0;
8306 if (_BitScanForward(&cnt, x))
8307 return cnt;
8308 return 32;
8309#else
8310 return x != 0 ? __builtin_ctz(x) : 32;
8311#endif
8312}
8313
8314FORCE_INLINE int _sse2neon_ctzll(unsigned long long x)
8315{
8316#ifdef _MSC_VER
8317 unsigned long cnt;
8318#if defined(SSE2NEON_HAS_BITSCAN64)
8319 if (_BitScanForward64(&cnt, x))
8320 return (int) (cnt);
8321#else
8322 if (_BitScanForward(&cnt, (unsigned long) (x)))
8323 return (int) cnt;
8324 if (_BitScanForward(&cnt, (unsigned long) (x >> 32)))
8325 return (int) (cnt + 32);
8326#endif /* SSE2NEON_HAS_BITSCAN64 */
8327 return 64;
8328#else /* assume GNU compatible compilers */
8329 return x != 0 ? __builtin_ctzll(x) : 64;
8330#endif
8331}
8332
8333#define SSE2NEON_MIN(x, y) (x) < (y) ? (x) : (y)
8334
8335#define SSE2NEON_CMPSTR_SET_UPPER(var, imm) \
8336 const int var = ((imm) & 0x01) ? 8 : 16
8337
8338#define SSE2NEON_CMPESTRX_LEN_PAIR(a, b, la, lb) \
8339 int tmp1 = la ^ (la >> 31); \
8340 la = tmp1 - (la >> 31); \
8341 int tmp2 = lb ^ (lb >> 31); \
8342 lb = tmp2 - (lb >> 31); \
8343 la = SSE2NEON_MIN(la, bound); \
8344 lb = SSE2NEON_MIN(lb, bound)
8345
8346// Compare all pairs of character in string a and b,
8347// then aggregate the result.
8348// As the only difference of PCMPESTR* and PCMPISTR* is the way to calculate the
8349// length of string, we use SSE2NEON_CMP{I,E}STRX_GET_LEN to get the length of
8350// string a and b.
8351#define SSE2NEON_COMP_AGG(a, b, la, lb, imm8, IE) \
8352 SSE2NEON_CMPSTR_SET_UPPER(bound, imm8); \
8353 SSE2NEON_##IE##_LEN_PAIR(a, b, la, lb); \
8354 uint16_t r2 = (_sse2neon_cmpfunc_table[(imm8) & 0x0f])(a, la, b, lb); \
8355 r2 = _sse2neon_sido_negative(r2, lb, imm8, bound)
8356
8357#define SSE2NEON_CMPSTR_GENERATE_INDEX(r2, bound, imm8) \
8358 return (r2 == 0) ? bound \
8359 : (((imm8) & 0x40) ? (31 - _sse2neon_clz(r2)) \
8360 : _sse2neon_ctz(r2))
8361
8362#define SSE2NEON_CMPSTR_GENERATE_MASK(dst) \
8363 __m128i dst = vreinterpretq_m128i_u8(vdupq_n_u8(0)); \
8364 if ((imm8) & 0x40) { \
8365 if (bound == 8) { \
8366 uint16x8_t tmp = vtstq_u16(vdupq_n_u16(r2), \
8367 vld1q_u16(_sse2neon_cmpestr_mask16b)); \
8368 dst = vreinterpretq_m128i_u16(vbslq_u16( \
8369 tmp, vdupq_n_u16(-1), vreinterpretq_u16_m128i(dst))); \
8370 } else { \
8371 uint8x16_t vec_r2 = vcombine_u8(vdup_n_u8((uint8_t) r2), \
8372 vdup_n_u8((uint8_t) (r2 >> 8))); \
8373 uint8x16_t tmp = \
8374 vtstq_u8(vec_r2, vld1q_u8(_sse2neon_cmpestr_mask8b)); \
8375 dst = vreinterpretq_m128i_u8( \
8376 vbslq_u8(tmp, vdupq_n_u8(-1), vreinterpretq_u8_m128i(dst))); \
8377 } \
8378 } else { \
8379 if (bound == 16) { \
8380 dst = vreinterpretq_m128i_u16( \
8381 vsetq_lane_u16(r2 & 0xffff, vreinterpretq_u16_m128i(dst), 0)); \
8382 } else { \
8383 dst = vreinterpretq_m128i_u8(vsetq_lane_u8( \
8384 (uint8_t) (r2 & 0xff), vreinterpretq_u8_m128i(dst), 0)); \
8385 } \
8386 } \
8387 return dst
8388
8389// Compare packed strings in a and b with lengths la and lb using the control
8390// in imm8, and returns 1 if b did not contain a null character and the
8391// resulting mask was zero, and 0 otherwise.
8392 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpestra
8393FORCE_INLINE int _mm_cmpestra(__m128i a,
8394 int la,
8395 __m128i b,
8396 int lb,
8397 const int imm8)
8398{
8399 int lb_cpy = lb;
8400 SSE2NEON_COMP_AGG(a, b, la, lb, imm8, CMPESTRX);
8401 return !r2 & (lb_cpy > bound);
8402}
8403
8404// Compare packed strings in a and b with lengths la and lb using the control in
8405// imm8, and returns 1 if the resulting mask was non-zero, and 0 otherwise.
8406 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpestrc
8407FORCE_INLINE int _mm_cmpestrc(__m128i a,
8408 int la,
8409 __m128i b,
8410 int lb,
8411 const int imm8)
8412{
8413 SSE2NEON_COMP_AGG(a, b, la, lb, imm8, CMPESTRX);
8414 return r2 != 0;
8415}
8416
8417// Compare packed strings in a and b with lengths la and lb using the control
8418// in imm8, and store the generated index in dst.
8419 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpestri
8420FORCE_INLINE int _mm_cmpestri(__m128i a,
8421 int la,
8422 __m128i b,
8423 int lb,
8424 const int imm8)
8425{
8426 SSE2NEON_COMP_AGG(a, b, la, lb, imm8, CMPESTRX);
8427 SSE2NEON_CMPSTR_GENERATE_INDEX(r2, bound, imm8);
8428}
8429
8430// Compare packed strings in a and b with lengths la and lb using the control
8431// in imm8, and store the generated mask in dst.
8432 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpestrm
8433FORCE_INLINE __m128i
8434_mm_cmpestrm(__m128i a, int la, __m128i b, int lb, const int imm8)
8435{
8436 SSE2NEON_COMP_AGG(a, b, la, lb, imm8, CMPESTRX);
8437 SSE2NEON_CMPSTR_GENERATE_MASK(dst);
8438}
8439
8440// Compare packed strings in a and b with lengths la and lb using the control in
8441// imm8, and returns bit 0 of the resulting bit mask.
8442 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpestro
8443FORCE_INLINE int _mm_cmpestro(__m128i a,
8444 int la,
8445 __m128i b,
8446 int lb,
8447 const int imm8)
8448{
8449 SSE2NEON_COMP_AGG(a, b, la, lb, imm8, CMPESTRX);
8450 return r2 & 1;
8451}
8452
8453// Compare packed strings in a and b with lengths la and lb using the control in
8454// imm8, and returns 1 if any character in a was null, and 0 otherwise.
8455 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpestrs
8456FORCE_INLINE int _mm_cmpestrs(__m128i a,
8457 int la,
8458 __m128i b,
8459 int lb,
8460 const int imm8)
8461{
8462 (void) a;
8463 (void) b;
8464 (void) lb;
8465 SSE2NEON_CMPSTR_SET_UPPER(bound, imm8);
8466 return la <= (bound - 1);
8467}
8468
8469// Compare packed strings in a and b with lengths la and lb using the control in
8470// imm8, and returns 1 if any character in b was null, and 0 otherwise.
8471 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpestrz
8472FORCE_INLINE int _mm_cmpestrz(__m128i a,
8473 int la,
8474 __m128i b,
8475 int lb,
8476 const int imm8)
8477{
8478 (void) a;
8479 (void) b;
8480 (void) la;
8481 SSE2NEON_CMPSTR_SET_UPPER(bound, imm8);
8482 return lb <= (bound - 1);
8483}
8484
8485#define SSE2NEON_CMPISTRX_LENGTH(str, len, imm8) \
8486 do { \
8487 if ((imm8) & 0x01) { \
8488 uint16x8_t equal_mask_##str = \
8489 vceqq_u16(vreinterpretq_u16_m128i(str), vdupq_n_u16(0)); \
8490 uint8x8_t res_##str = vshrn_n_u16(equal_mask_##str, 4); \
8491 uint64_t matches_##str = \
8492 vget_lane_u64(vreinterpret_u64_u8(res_##str), 0); \
8493 len = _sse2neon_ctzll(matches_##str) >> 3; \
8494 } else { \
8495 uint16x8_t equal_mask_##str = vreinterpretq_u16_u8( \
8496 vceqq_u8(vreinterpretq_u8_m128i(str), vdupq_n_u8(0))); \
8497 uint8x8_t res_##str = vshrn_n_u16(equal_mask_##str, 4); \
8498 uint64_t matches_##str = \
8499 vget_lane_u64(vreinterpret_u64_u8(res_##str), 0); \
8500 len = _sse2neon_ctzll(matches_##str) >> 2; \
8501 } \
8502 } while (0)
8503
8504#define SSE2NEON_CMPISTRX_LEN_PAIR(a, b, la, lb) \
8505 int la, lb; \
8506 do { \
8507 SSE2NEON_CMPISTRX_LENGTH(a, la, imm8); \
8508 SSE2NEON_CMPISTRX_LENGTH(b, lb, imm8); \
8509 } while (0)
8510
8511// Compare packed strings with implicit lengths in a and b using the control in
8512// imm8, and returns 1 if b did not contain a null character and the resulting
8513// mask was zero, and 0 otherwise.
8514 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpistra
8515FORCE_INLINE int _mm_cmpistra(__m128i a, __m128i b, const int imm8)
8516{
8517 SSE2NEON_COMP_AGG(a, b, la, lb, imm8, CMPISTRX);
8518 return !r2 & (lb >= bound);
8519}
8520
8521// Compare packed strings with implicit lengths in a and b using the control in
8522// imm8, and returns 1 if the resulting mask was non-zero, and 0 otherwise.
8523 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpistrc
8524FORCE_INLINE int _mm_cmpistrc(__m128i a, __m128i b, const int imm8)
8525{
8526 SSE2NEON_COMP_AGG(a, b, la, lb, imm8, CMPISTRX);
8527 return r2 != 0;
8528}
8529
8530// Compare packed strings with implicit lengths in a and b using the control in
8531// imm8, and store the generated index in dst.
8532 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpistri
8533FORCE_INLINE int _mm_cmpistri(__m128i a, __m128i b, const int imm8)
8534{
8535 SSE2NEON_COMP_AGG(a, b, la, lb, imm8, CMPISTRX);
8536 SSE2NEON_CMPSTR_GENERATE_INDEX(r2, bound, imm8);
8537}
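/* Editor's note: illustrative usage sketch, not part of the original header;
 * example_find_any is a hypothetical helper in the spirit of strpbrk. With
 * unsigned-byte "equal any" mode, operand a is the character set and operand
 * b is the text; the returned index refers to b, and 16 means "no match". */
static inline int example_find_any(__m128i set, __m128i text)
{
    return _mm_cmpistri(set, text,
                        _SIDD_UBYTE_OPS | _SIDD_CMP_EQUAL_ANY |
                            _SIDD_LEAST_SIGNIFICANT);
}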
8538
8539// Compare packed strings with implicit lengths in a and b using the control in
8540// imm8, and store the generated mask in dst.
8541// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpistrm
8542FORCE_INLINE __m128i _mm_cmpistrm(__m128i a, __m128i b, const int imm8)
8543{
8544 SSE2NEON_COMP_AGG(a, b, la, lb, imm8, CMPISTRX);
8545 SSE2NEON_CMPSTR_GENERATE_MASK(dst);
8546}
8547
8548// Compare packed strings with implicit lengths in a and b using the control in
8549// imm8, and returns bit 0 of the resulting bit mask.
8550 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpistro
8551FORCE_INLINE int _mm_cmpistro(__m128i a, __m128i b, const int imm8)
8552{
8553 SSE2NEON_COMP_AGG(a, b, la, lb, imm8, CMPISTRX);
8554 return r2 & 1;
8555}
8556
8557// Compare packed strings with implicit lengths in a and b using the control in
8558// imm8, and returns 1 if any character in a was null, and 0 otherwise.
8559 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpistrs
8560FORCE_INLINE int _mm_cmpistrs(__m128i a, __m128i b, const int imm8)
8561{
8562 (void) b;
8563 SSE2NEON_CMPSTR_SET_UPPER(bound, imm8);
8564 int la;
8565 SSE2NEON_CMPISTRX_LENGTH(a, la, imm8);
8566 return la <= (bound - 1);
8567}
8568
8569// Compare packed strings with implicit lengths in a and b using the control in
8570// imm8, and returns 1 if any character in b was null, and 0 otherwise.
8571 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpistrz
8572FORCE_INLINE int _mm_cmpistrz(__m128i a, __m128i b, const int imm8)
8573{
8574 (void) a;
8575 SSE2NEON_CMPSTR_SET_UPPER(bound, imm8);
8576 int lb;
8577 SSE2NEON_CMPISTRX_LENGTH(b, lb, imm8);
8578 return lb <= (bound - 1);
8579}
8580
8581// Compares the 2 signed 64-bit integers in a and the 2 signed 64-bit integers
8582// in b for greater than.
8583FORCE_INLINE __m128i _mm_cmpgt_epi64(__m128i a, __m128i b)
8584{
8585#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
8586 return vreinterpretq_m128i_u64(
8587 vcgtq_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(b)));
8588#else
8589 return vreinterpretq_m128i_s64(vshrq_n_s64(
8590 vqsubq_s64(vreinterpretq_s64_m128i(b), vreinterpretq_s64_m128i(a)),
8591 63));
8592#endif
8593}
8594
8595// Starting with the initial value in crc, accumulates a CRC32 value for
8596// unsigned 16-bit integer v, and stores the result in dst.
8597// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_crc32_u16
8598FORCE_INLINE uint32_t _mm_crc32_u16(uint32_t crc, uint16_t v)
8599{
8600#if defined(__aarch64__) && defined(__ARM_FEATURE_CRC32)
8601 __asm__ __volatile__("crc32ch %w[c], %w[c], %w[v]\n\t"
8602 : [c] "+r"(crc)
8603 : [v] "r"(v));
8604#elif ((__ARM_ARCH == 8) && defined(__ARM_FEATURE_CRC32)) || \
8605 ((defined(_M_ARM64) || defined(_M_ARM64EC)) && !defined(__clang__))
8606 crc = __crc32ch(crc, v);
8607#else
8608 crc = _mm_crc32_u8(crc, (uint8_t) (v & 0xff));
8609 crc = _mm_crc32_u8(crc, (uint8_t) ((v >> 8) & 0xff));
8610#endif
8611 return crc;
8612}
8613
8614// Starting with the initial value in crc, accumulates a CRC32 value for
8615// unsigned 32-bit integer v, and stores the result in dst.
8616// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_crc32_u32
8617FORCE_INLINE uint32_t _mm_crc32_u32(uint32_t crc, uint32_t v)
8618{
8619#if defined(__aarch64__) && defined(__ARM_FEATURE_CRC32)
8620 __asm__ __volatile__("crc32cw %w[c], %w[c], %w[v]\n\t"
8621 : [c] "+r"(crc)
8622 : [v] "r"(v));
8623#elif ((__ARM_ARCH == 8) && defined(__ARM_FEATURE_CRC32)) || \
8624 ((defined(_M_ARM64) || defined(_M_ARM64EC)) && !defined(__clang__))
8625 crc = __crc32cw(crc, v);
8626#else
8627 crc = _mm_crc32_u16(crc, (uint16_t) (v & 0xffff));
8628 crc = _mm_crc32_u16(crc, (uint16_t) ((v >> 16) & 0xffff));
8629#endif
8630 return crc;
8631}
8632
8633// Starting with the initial value in crc, accumulates a CRC32 value for
8634// unsigned 64-bit integer v, and stores the result in dst.
8635// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_crc32_u64
8636FORCE_INLINE uint64_t _mm_crc32_u64(uint64_t crc, uint64_t v)
8637{
8638#if defined(__aarch64__) && defined(__ARM_FEATURE_CRC32)
8639 __asm__ __volatile__("crc32cx %w[c], %w[c], %x[v]\n\t"
8640 : [c] "+r"(crc)
8641 : [v] "r"(v));
8642#elif ((defined(_M_ARM64) || defined(_M_ARM64EC)) && !defined(__clang__))
8643 crc = __crc32cd((uint32_t) crc, v);
8644#else
8645 crc = _mm_crc32_u32((uint32_t) (crc), (uint32_t) (v & 0xffffffff));
8646 crc = _mm_crc32_u32((uint32_t) (crc), (uint32_t) ((v >> 32) & 0xffffffff));
8647#endif
8648 return crc;
8649}
8650
8651// Starting with the initial value in crc, accumulates a CRC32 value for
8652// unsigned 8-bit integer v, and stores the result in dst.
8653// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_crc32_u8
8654FORCE_INLINE uint32_t _mm_crc32_u8(uint32_t crc, uint8_t v)
8655{
8656#if defined(__aarch64__) && defined(__ARM_FEATURE_CRC32)
8657 __asm__ __volatile__("crc32cb %w[c], %w[c], %w[v]\n\t"
8658 : [c] "+r"(crc)
8659 : [v] "r"(v));
8660#elif ((__ARM_ARCH == 8) && defined(__ARM_FEATURE_CRC32)) || \
8661 ((defined(_M_ARM64) || defined(_M_ARM64EC)) && !defined(__clang__))
8662 crc = __crc32cb(crc, v);
8663#else
8664 crc ^= v;
8665#if defined(__ARM_FEATURE_CRYPTO)
8666 // Adapted from: https://mary.rs/lab/crc32/
8667    // Barrett reduction
8668 uint64x2_t orig =
8669 vcombine_u64(vcreate_u64((uint64_t) (crc) << 24), vcreate_u64(0x0));
8670 uint64x2_t tmp = orig;
8671
8672 // Polynomial P(x) of CRC32C
8673 uint64_t p = 0x105EC76F1;
8674 // Barrett Reduction (in bit-reflected form) constant mu_{64} = \lfloor
8675 // 2^{64} / P(x) \rfloor = 0x11f91caf6
8676 uint64_t mu = 0x1dea713f1;
8677
8678 // Multiply by mu_{64}
8679 tmp = _sse2neon_vmull_p64(vget_low_u64(tmp), vcreate_u64(mu));
8680 // Divide by 2^{64} (mask away the unnecessary bits)
8681 tmp =
8682 vandq_u64(tmp, vcombine_u64(vcreate_u64(0xFFFFFFFF), vcreate_u64(0x0)));
8683 // Multiply by P(x) (shifted left by 1 for alignment reasons)
8684 tmp = _sse2neon_vmull_p64(vget_low_u64(tmp), vcreate_u64(p));
8685 // Subtract original from result
8686 tmp = veorq_u64(tmp, orig);
8687
8688 // Extract the 'lower' (in bit-reflected sense) 32 bits
8689 crc = vgetq_lane_u32(vreinterpretq_u32_u64(tmp), 1);
8690#else // Fall back to the generic table lookup approach
8691 // Adapted from: https://create.stephan-brumme.com/crc32/
8692    // Apply the half-byte (nibble) lookup algorithm, a good trade-off
8693    // between performance and lookup-table size.
8694
8695 // The lookup table just needs to store every 16th entry
8696 // of the standard look-up table.
8697 static const uint32_t crc32_half_byte_tbl[] = {
8698 0x00000000, 0x105ec76f, 0x20bd8ede, 0x30e349b1, 0x417b1dbc, 0x5125dad3,
8699 0x61c69362, 0x7198540d, 0x82f63b78, 0x92a8fc17, 0xa24bb5a6, 0xb21572c9,
8700 0xc38d26c4, 0xd3d3e1ab, 0xe330a81a, 0xf36e6f75,
8701 };
8702
8703 crc = (crc >> 4) ^ crc32_half_byte_tbl[crc & 0x0F];
8704 crc = (crc >> 4) ^ crc32_half_byte_tbl[crc & 0x0F];
8705#endif
8706#endif
8707 return crc;
8708}
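// Usage sketch (illustrative only, not part of the upstream header): folding
// a whole buffer into a CRC-32C with the accumulators above, 8 bytes at a
// time, using one common convention (initial value 0xFFFFFFFF, final XOR).
// The helper name is hypothetical, and memcpy/size_t assume <string.h> and
// <stddef.h> are available.
FORCE_INLINE uint32_t sse2neon_example_crc32c(const void *buf, size_t len)
{
    const uint8_t *p = (const uint8_t *) buf;
    uint64_t crc = 0xFFFFFFFF;
    while (len >= 8) {
        uint64_t chunk;
        memcpy(&chunk, p, sizeof(chunk)); // unaligned 64-bit load
        crc = _mm_crc32_u64(crc, chunk);
        p += 8;
        len -= 8;
    }
    while (len--)
        crc = _mm_crc32_u8((uint32_t) crc, *p++);
    return (uint32_t) crc ^ 0xFFFFFFFF;
}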
8709
8710/* AES */
8711
8712#if !defined(__ARM_FEATURE_CRYPTO) && \
8713 ((!defined(_M_ARM64) && !defined(_M_ARM64EC)) || defined(__clang__))
8714/* clang-format off */
8715#define SSE2NEON_AES_SBOX(w) \
8716 { \
8717 w(0x63), w(0x7c), w(0x77), w(0x7b), w(0xf2), w(0x6b), w(0x6f), \
8718 w(0xc5), w(0x30), w(0x01), w(0x67), w(0x2b), w(0xfe), w(0xd7), \
8719 w(0xab), w(0x76), w(0xca), w(0x82), w(0xc9), w(0x7d), w(0xfa), \
8720 w(0x59), w(0x47), w(0xf0), w(0xad), w(0xd4), w(0xa2), w(0xaf), \
8721 w(0x9c), w(0xa4), w(0x72), w(0xc0), w(0xb7), w(0xfd), w(0x93), \
8722 w(0x26), w(0x36), w(0x3f), w(0xf7), w(0xcc), w(0x34), w(0xa5), \
8723 w(0xe5), w(0xf1), w(0x71), w(0xd8), w(0x31), w(0x15), w(0x04), \
8724 w(0xc7), w(0x23), w(0xc3), w(0x18), w(0x96), w(0x05), w(0x9a), \
8725 w(0x07), w(0x12), w(0x80), w(0xe2), w(0xeb), w(0x27), w(0xb2), \
8726 w(0x75), w(0x09), w(0x83), w(0x2c), w(0x1a), w(0x1b), w(0x6e), \
8727 w(0x5a), w(0xa0), w(0x52), w(0x3b), w(0xd6), w(0xb3), w(0x29), \
8728 w(0xe3), w(0x2f), w(0x84), w(0x53), w(0xd1), w(0x00), w(0xed), \
8729 w(0x20), w(0xfc), w(0xb1), w(0x5b), w(0x6a), w(0xcb), w(0xbe), \
8730 w(0x39), w(0x4a), w(0x4c), w(0x58), w(0xcf), w(0xd0), w(0xef), \
8731 w(0xaa), w(0xfb), w(0x43), w(0x4d), w(0x33), w(0x85), w(0x45), \
8732 w(0xf9), w(0x02), w(0x7f), w(0x50), w(0x3c), w(0x9f), w(0xa8), \
8733 w(0x51), w(0xa3), w(0x40), w(0x8f), w(0x92), w(0x9d), w(0x38), \
8734 w(0xf5), w(0xbc), w(0xb6), w(0xda), w(0x21), w(0x10), w(0xff), \
8735 w(0xf3), w(0xd2), w(0xcd), w(0x0c), w(0x13), w(0xec), w(0x5f), \
8736 w(0x97), w(0x44), w(0x17), w(0xc4), w(0xa7), w(0x7e), w(0x3d), \
8737 w(0x64), w(0x5d), w(0x19), w(0x73), w(0x60), w(0x81), w(0x4f), \
8738 w(0xdc), w(0x22), w(0x2a), w(0x90), w(0x88), w(0x46), w(0xee), \
8739 w(0xb8), w(0x14), w(0xde), w(0x5e), w(0x0b), w(0xdb), w(0xe0), \
8740 w(0x32), w(0x3a), w(0x0a), w(0x49), w(0x06), w(0x24), w(0x5c), \
8741 w(0xc2), w(0xd3), w(0xac), w(0x62), w(0x91), w(0x95), w(0xe4), \
8742 w(0x79), w(0xe7), w(0xc8), w(0x37), w(0x6d), w(0x8d), w(0xd5), \
8743 w(0x4e), w(0xa9), w(0x6c), w(0x56), w(0xf4), w(0xea), w(0x65), \
8744 w(0x7a), w(0xae), w(0x08), w(0xba), w(0x78), w(0x25), w(0x2e), \
8745 w(0x1c), w(0xa6), w(0xb4), w(0xc6), w(0xe8), w(0xdd), w(0x74), \
8746 w(0x1f), w(0x4b), w(0xbd), w(0x8b), w(0x8a), w(0x70), w(0x3e), \
8747 w(0xb5), w(0x66), w(0x48), w(0x03), w(0xf6), w(0x0e), w(0x61), \
8748 w(0x35), w(0x57), w(0xb9), w(0x86), w(0xc1), w(0x1d), w(0x9e), \
8749 w(0xe1), w(0xf8), w(0x98), w(0x11), w(0x69), w(0xd9), w(0x8e), \
8750 w(0x94), w(0x9b), w(0x1e), w(0x87), w(0xe9), w(0xce), w(0x55), \
8751 w(0x28), w(0xdf), w(0x8c), w(0xa1), w(0x89), w(0x0d), w(0xbf), \
8752 w(0xe6), w(0x42), w(0x68), w(0x41), w(0x99), w(0x2d), w(0x0f), \
8753 w(0xb0), w(0x54), w(0xbb), w(0x16) \
8754 }
8755#define SSE2NEON_AES_RSBOX(w) \
8756 { \
8757 w(0x52), w(0x09), w(0x6a), w(0xd5), w(0x30), w(0x36), w(0xa5), \
8758 w(0x38), w(0xbf), w(0x40), w(0xa3), w(0x9e), w(0x81), w(0xf3), \
8759 w(0xd7), w(0xfb), w(0x7c), w(0xe3), w(0x39), w(0x82), w(0x9b), \
8760 w(0x2f), w(0xff), w(0x87), w(0x34), w(0x8e), w(0x43), w(0x44), \
8761 w(0xc4), w(0xde), w(0xe9), w(0xcb), w(0x54), w(0x7b), w(0x94), \
8762 w(0x32), w(0xa6), w(0xc2), w(0x23), w(0x3d), w(0xee), w(0x4c), \
8763 w(0x95), w(0x0b), w(0x42), w(0xfa), w(0xc3), w(0x4e), w(0x08), \
8764 w(0x2e), w(0xa1), w(0x66), w(0x28), w(0xd9), w(0x24), w(0xb2), \
8765 w(0x76), w(0x5b), w(0xa2), w(0x49), w(0x6d), w(0x8b), w(0xd1), \
8766 w(0x25), w(0x72), w(0xf8), w(0xf6), w(0x64), w(0x86), w(0x68), \
8767 w(0x98), w(0x16), w(0xd4), w(0xa4), w(0x5c), w(0xcc), w(0x5d), \
8768 w(0x65), w(0xb6), w(0x92), w(0x6c), w(0x70), w(0x48), w(0x50), \
8769 w(0xfd), w(0xed), w(0xb9), w(0xda), w(0x5e), w(0x15), w(0x46), \
8770 w(0x57), w(0xa7), w(0x8d), w(0x9d), w(0x84), w(0x90), w(0xd8), \
8771 w(0xab), w(0x00), w(0x8c), w(0xbc), w(0xd3), w(0x0a), w(0xf7), \
8772 w(0xe4), w(0x58), w(0x05), w(0xb8), w(0xb3), w(0x45), w(0x06), \
8773 w(0xd0), w(0x2c), w(0x1e), w(0x8f), w(0xca), w(0x3f), w(0x0f), \
8774 w(0x02), w(0xc1), w(0xaf), w(0xbd), w(0x03), w(0x01), w(0x13), \
8775 w(0x8a), w(0x6b), w(0x3a), w(0x91), w(0x11), w(0x41), w(0x4f), \
8776 w(0x67), w(0xdc), w(0xea), w(0x97), w(0xf2), w(0xcf), w(0xce), \
8777 w(0xf0), w(0xb4), w(0xe6), w(0x73), w(0x96), w(0xac), w(0x74), \
8778 w(0x22), w(0xe7), w(0xad), w(0x35), w(0x85), w(0xe2), w(0xf9), \
8779 w(0x37), w(0xe8), w(0x1c), w(0x75), w(0xdf), w(0x6e), w(0x47), \
8780 w(0xf1), w(0x1a), w(0x71), w(0x1d), w(0x29), w(0xc5), w(0x89), \
8781 w(0x6f), w(0xb7), w(0x62), w(0x0e), w(0xaa), w(0x18), w(0xbe), \
8782 w(0x1b), w(0xfc), w(0x56), w(0x3e), w(0x4b), w(0xc6), w(0xd2), \
8783 w(0x79), w(0x20), w(0x9a), w(0xdb), w(0xc0), w(0xfe), w(0x78), \
8784 w(0xcd), w(0x5a), w(0xf4), w(0x1f), w(0xdd), w(0xa8), w(0x33), \
8785 w(0x88), w(0x07), w(0xc7), w(0x31), w(0xb1), w(0x12), w(0x10), \
8786 w(0x59), w(0x27), w(0x80), w(0xec), w(0x5f), w(0x60), w(0x51), \
8787 w(0x7f), w(0xa9), w(0x19), w(0xb5), w(0x4a), w(0x0d), w(0x2d), \
8788 w(0xe5), w(0x7a), w(0x9f), w(0x93), w(0xc9), w(0x9c), w(0xef), \
8789 w(0xa0), w(0xe0), w(0x3b), w(0x4d), w(0xae), w(0x2a), w(0xf5), \
8790 w(0xb0), w(0xc8), w(0xeb), w(0xbb), w(0x3c), w(0x83), w(0x53), \
8791 w(0x99), w(0x61), w(0x17), w(0x2b), w(0x04), w(0x7e), w(0xba), \
8792 w(0x77), w(0xd6), w(0x26), w(0xe1), w(0x69), w(0x14), w(0x63), \
8793 w(0x55), w(0x21), w(0x0c), w(0x7d) \
8794 }
8795/* clang-format on */
8796
8797/* X Macro trick. See https://en.wikipedia.org/wiki/X_Macro */
8798#define SSE2NEON_AES_H0(x) (x)
8799static const uint8_t _sse2neon_sbox[256] = SSE2NEON_AES_SBOX(SSE2NEON_AES_H0);
8800static const uint8_t _sse2neon_rsbox[256] = SSE2NEON_AES_RSBOX(SSE2NEON_AES_H0);
8801#undef SSE2NEON_AES_H0
8802
8803/* x_time function and matrix multiply function */
8804#if !defined(__aarch64__)
8805#define SSE2NEON_XT(x) (((x) << 1) ^ ((((x) >> 7) & 1) * 0x1b))
8806#define SSE2NEON_MULTIPLY(x, y) \
8807 (((y & 1) * x) ^ ((y >> 1 & 1) * SSE2NEON_XT(x)) ^ \
8808 ((y >> 2 & 1) * SSE2NEON_XT(SSE2NEON_XT(x))) ^ \
8809 ((y >> 3 & 1) * SSE2NEON_XT(SSE2NEON_XT(SSE2NEON_XT(x)))) ^ \
8810 ((y >> 4 & 1) * SSE2NEON_XT(SSE2NEON_XT(SSE2NEON_XT(SSE2NEON_XT(x))))))
8811#endif
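// Worked example of the two helpers above, using the {57} x {13} = {fe}
// multiplication from FIPS-197 (values shown after the truncation to 8 bits
// that happens when results are stored into the uint8_t state below):
//   SSE2NEON_XT(0x57)                      -> 0xae   ({57} x {02})
//   SSE2NEON_XT(SSE2NEON_XT(0x57)) & 0xff  -> 0x47   ({57} x {04})
//   SSE2NEON_MULTIPLY(0x57, 0x13) & 0xff   -> 0xfe   ({57} x ({10}^{02}^{01}))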
8812
8813// In the absence of crypto extensions, implement aesenc using regular NEON
8814// intrinsics instead. See:
8815// https://www.workofard.com/2017/01/accelerated-aes-for-the-arm64-linux-kernel/
8816// https://www.workofard.com/2017/07/ghash-for-low-end-cores/
8817// for more information.
8818FORCE_INLINE __m128i _mm_aesenc_si128(__m128i a, __m128i RoundKey)
8819{
8820#if defined(__aarch64__)
8821 static const uint8_t shift_rows[] = {
8822 0x0, 0x5, 0xa, 0xf, 0x4, 0x9, 0xe, 0x3,
8823 0x8, 0xd, 0x2, 0x7, 0xc, 0x1, 0x6, 0xb,
8824 };
8825 static const uint8_t ror32by8[] = {
8826 0x1, 0x2, 0x3, 0x0, 0x5, 0x6, 0x7, 0x4,
8827 0x9, 0xa, 0xb, 0x8, 0xd, 0xe, 0xf, 0xc,
8828 };
8829
8830 uint8x16_t v;
8831 uint8x16_t w = vreinterpretq_u8_m128i(a);
8832
8833 /* shift rows */
8834 w = vqtbl1q_u8(w, vld1q_u8(shift_rows));
8835
8836 /* sub bytes */
8837    // Here, we split the whole 256-byte table into four 64-byte tables and
8838    // look them up one after another. Each subsequent lookup uses the table
8839    // that starts 64 bytes further in, so the indices passed to
8840    // `vqtbx4q_u8()` must be lowered by the same offset as the table pointer
8841    // was advanced (hence the 'w - 0x40', 'w - 0x80', 'w - 0xc0' below).
8842 v = vqtbl4q_u8(_sse2neon_vld1q_u8_x4(_sse2neon_sbox), w);
8843 // 'w-0x40' equals to 'vsubq_u8(w, vdupq_n_u8(0x40))'
8844 v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_sbox + 0x40), w - 0x40);
8845 v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_sbox + 0x80), w - 0x80);
8846 v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_sbox + 0xc0), w - 0xc0);
8847
8848 /* mix columns */
8849 w = (v << 1) ^ (uint8x16_t) (((int8x16_t) v >> 7) & 0x1b);
8850 w ^= (uint8x16_t) vrev32q_u16((uint16x8_t) v);
8851 w ^= vqtbl1q_u8(v ^ w, vld1q_u8(ror32by8));
8852
8853 /* add round key */
8854 return vreinterpretq_m128i_u8(w) ^ RoundKey;
8855
8856#else /* ARMv7-A implementation for a table-based AES */
8857#define SSE2NEON_AES_B2W(b0, b1, b2, b3) \
8858 (((uint32_t) (b3) << 24) | ((uint32_t) (b2) << 16) | \
8859 ((uint32_t) (b1) << 8) | (uint32_t) (b0))
8860// multiplying 'x' by 2 in GF(2^8)
8861#define SSE2NEON_AES_F2(x) ((x << 1) ^ (((x >> 7) & 1) * 0x011b /* WPOLY */))
8862// multiplying 'x' by 3 in GF(2^8)
8863#define SSE2NEON_AES_F3(x) (SSE2NEON_AES_F2(x) ^ x)
8864#define SSE2NEON_AES_U0(p) \
8865 SSE2NEON_AES_B2W(SSE2NEON_AES_F2(p), p, p, SSE2NEON_AES_F3(p))
8866#define SSE2NEON_AES_U1(p) \
8867 SSE2NEON_AES_B2W(SSE2NEON_AES_F3(p), SSE2NEON_AES_F2(p), p, p)
8868#define SSE2NEON_AES_U2(p) \
8869 SSE2NEON_AES_B2W(p, SSE2NEON_AES_F3(p), SSE2NEON_AES_F2(p), p)
8870#define SSE2NEON_AES_U3(p) \
8871 SSE2NEON_AES_B2W(p, p, SSE2NEON_AES_F3(p), SSE2NEON_AES_F2(p))
8872
8873 // this generates a table containing every possible permutation of
8874 // shift_rows() and sub_bytes() with mix_columns().
8875    static const uint32_t ALIGN_STRUCT(16) aes_table[4][256] = {
8876        SSE2NEON_AES_SBOX(SSE2NEON_AES_U0),
8877        SSE2NEON_AES_SBOX(SSE2NEON_AES_U1),
8878        SSE2NEON_AES_SBOX(SSE2NEON_AES_U2),
8879        SSE2NEON_AES_SBOX(SSE2NEON_AES_U3),
8880    };
8881#undef SSE2NEON_AES_B2W
8882#undef SSE2NEON_AES_F2
8883#undef SSE2NEON_AES_F3
8884#undef SSE2NEON_AES_U0
8885#undef SSE2NEON_AES_U1
8886#undef SSE2NEON_AES_U2
8887#undef SSE2NEON_AES_U3
8888
8889 uint32_t x0 = _mm_cvtsi128_si32(a); // get a[31:0]
8890 uint32_t x1 =
8891 _mm_cvtsi128_si32(_mm_shuffle_epi32(a, 0x55)); // get a[63:32]
8892 uint32_t x2 =
8893 _mm_cvtsi128_si32(_mm_shuffle_epi32(a, 0xAA)); // get a[95:64]
8894 uint32_t x3 =
8895 _mm_cvtsi128_si32(_mm_shuffle_epi32(a, 0xFF)); // get a[127:96]
8896
8897 // finish the modulo addition step in mix_columns()
8898 __m128i out = _mm_set_epi32(
8899 (aes_table[0][x3 & 0xff] ^ aes_table[1][(x0 >> 8) & 0xff] ^
8900 aes_table[2][(x1 >> 16) & 0xff] ^ aes_table[3][x2 >> 24]),
8901 (aes_table[0][x2 & 0xff] ^ aes_table[1][(x3 >> 8) & 0xff] ^
8902 aes_table[2][(x0 >> 16) & 0xff] ^ aes_table[3][x1 >> 24]),
8903 (aes_table[0][x1 & 0xff] ^ aes_table[1][(x2 >> 8) & 0xff] ^
8904 aes_table[2][(x3 >> 16) & 0xff] ^ aes_table[3][x0 >> 24]),
8905 (aes_table[0][x0 & 0xff] ^ aes_table[1][(x1 >> 8) & 0xff] ^
8906 aes_table[2][(x2 >> 16) & 0xff] ^ aes_table[3][x3 >> 24]));
8907
8908 return _mm_xor_si128(out, RoundKey);
8909#endif
8910}
8911
8912// Perform one round of an AES decryption flow on data (state) in a using the
8913// round key in RoundKey, and store the result in dst.
8914// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_aesdec_si128
8915FORCE_INLINE __m128i _mm_aesdec_si128(__m128i a, __m128i RoundKey)
8916{
8917#if defined(__aarch64__)
8918 static const uint8_t inv_shift_rows[] = {
8919 0x0, 0xd, 0xa, 0x7, 0x4, 0x1, 0xe, 0xb,
8920 0x8, 0x5, 0x2, 0xf, 0xc, 0x9, 0x6, 0x3,
8921 };
8922 static const uint8_t ror32by8[] = {
8923 0x1, 0x2, 0x3, 0x0, 0x5, 0x6, 0x7, 0x4,
8924 0x9, 0xa, 0xb, 0x8, 0xd, 0xe, 0xf, 0xc,
8925 };
8926
8927 uint8x16_t v;
8928 uint8x16_t w = vreinterpretq_u8_m128i(a);
8929
8930 // inverse shift rows
8931 w = vqtbl1q_u8(w, vld1q_u8(inv_shift_rows));
8932
8933 // inverse sub bytes
8934 v = vqtbl4q_u8(_sse2neon_vld1q_u8_x4(_sse2neon_rsbox), w);
8935 v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_rsbox + 0x40), w - 0x40);
8936 v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_rsbox + 0x80), w - 0x80);
8937 v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_rsbox + 0xc0), w - 0xc0);
8938
8939 // inverse mix columns
8940 // multiplying 'v' by 4 in GF(2^8)
8941 w = (v << 1) ^ (uint8x16_t) (((int8x16_t) v >> 7) & 0x1b);
8942 w = (w << 1) ^ (uint8x16_t) (((int8x16_t) w >> 7) & 0x1b);
8943 v ^= w;
8944 v ^= (uint8x16_t) vrev32q_u16((uint16x8_t) w);
8945
8946 w = (v << 1) ^ (uint8x16_t) (((int8x16_t) v >> 7) &
8947 0x1b); // multiplying 'v' by 2 in GF(2^8)
8948 w ^= (uint8x16_t) vrev32q_u16((uint16x8_t) v);
8949 w ^= vqtbl1q_u8(v ^ w, vld1q_u8(ror32by8));
8950
8951 // add round key
8952 return vreinterpretq_m128i_u8(w) ^ RoundKey;
8953
8954#else /* ARMv7-A NEON implementation */
8955 /* FIXME: optimized for NEON */
8956 uint8_t i, e, f, g, h, v[4][4];
8957 uint8_t *_a = (uint8_t *) &a;
8958 for (i = 0; i < 16; ++i) {
8959 v[((i / 4) + (i % 4)) % 4][i % 4] = _sse2neon_rsbox[_a[i]];
8960 }
8961
8962 // inverse mix columns
8963 for (i = 0; i < 4; ++i) {
8964 e = v[i][0];
8965 f = v[i][1];
8966 g = v[i][2];
8967 h = v[i][3];
8968
8969 v[i][0] = SSE2NEON_MULTIPLY(e, 0x0e) ^ SSE2NEON_MULTIPLY(f, 0x0b) ^
8970 SSE2NEON_MULTIPLY(g, 0x0d) ^ SSE2NEON_MULTIPLY(h, 0x09);
8971 v[i][1] = SSE2NEON_MULTIPLY(e, 0x09) ^ SSE2NEON_MULTIPLY(f, 0x0e) ^
8972 SSE2NEON_MULTIPLY(g, 0x0b) ^ SSE2NEON_MULTIPLY(h, 0x0d);
8973 v[i][2] = SSE2NEON_MULTIPLY(e, 0x0d) ^ SSE2NEON_MULTIPLY(f, 0x09) ^
8974 SSE2NEON_MULTIPLY(g, 0x0e) ^ SSE2NEON_MULTIPLY(h, 0x0b);
8975 v[i][3] = SSE2NEON_MULTIPLY(e, 0x0b) ^ SSE2NEON_MULTIPLY(f, 0x0d) ^
8976 SSE2NEON_MULTIPLY(g, 0x09) ^ SSE2NEON_MULTIPLY(h, 0x0e);
8977 }
8978
8979 return _mm_xor_si128(vreinterpretq_m128i_u8(vld1q_u8((uint8_t *) v)),
8980 RoundKey);
8981#endif
8982}
8983
8984// Perform the last round of an AES encryption flow on data (state) in a using
8985// the round key in RoundKey, and store the result in dst.
8986// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_aesenclast_si128
8987FORCE_INLINE __m128i _mm_aesenclast_si128(__m128i a, __m128i RoundKey)
8988{
8989#if defined(__aarch64__)
8990 static const uint8_t shift_rows[] = {
8991 0x0, 0x5, 0xa, 0xf, 0x4, 0x9, 0xe, 0x3,
8992 0x8, 0xd, 0x2, 0x7, 0xc, 0x1, 0x6, 0xb,
8993 };
8994
8995 uint8x16_t v;
8996 uint8x16_t w = vreinterpretq_u8_m128i(a);
8997
8998 // shift rows
8999 w = vqtbl1q_u8(w, vld1q_u8(shift_rows));
9000
9001 // sub bytes
9002 v = vqtbl4q_u8(_sse2neon_vld1q_u8_x4(_sse2neon_sbox), w);
9003 v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_sbox + 0x40), w - 0x40);
9004 v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_sbox + 0x80), w - 0x80);
9005 v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_sbox + 0xc0), w - 0xc0);
9006
9007 // add round key
9008 return vreinterpretq_m128i_u8(v) ^ RoundKey;
9009
9010#else /* ARMv7-A implementation */
9011 uint8_t v[16] = {
9012 _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 0)],
9013 _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 5)],
9014 _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 10)],
9015 _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 15)],
9016 _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 4)],
9017 _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 9)],
9018 _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 14)],
9019 _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 3)],
9020 _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 8)],
9021 _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 13)],
9022 _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 2)],
9023 _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 7)],
9024 _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 12)],
9025 _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 1)],
9026 _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 6)],
9027 _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 11)],
9028 };
9029
9030 return _mm_xor_si128(vreinterpretq_m128i_u8(vld1q_u8(v)), RoundKey);
9031#endif
9032}
9033
9034// Perform the last round of an AES decryption flow on data (state) in a using
9035// the round key in RoundKey, and store the result in dst.
9036// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_aesdeclast_si128
9037FORCE_INLINE __m128i _mm_aesdeclast_si128(__m128i a, __m128i RoundKey)
9038{
9039#if defined(__aarch64__)
9040 static const uint8_t inv_shift_rows[] = {
9041 0x0, 0xd, 0xa, 0x7, 0x4, 0x1, 0xe, 0xb,
9042 0x8, 0x5, 0x2, 0xf, 0xc, 0x9, 0x6, 0x3,
9043 };
9044
9045 uint8x16_t v;
9046 uint8x16_t w = vreinterpretq_u8_m128i(a);
9047
9048 // inverse shift rows
9049 w = vqtbl1q_u8(w, vld1q_u8(inv_shift_rows));
9050
9051 // inverse sub bytes
9052 v = vqtbl4q_u8(_sse2neon_vld1q_u8_x4(_sse2neon_rsbox), w);
9053 v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_rsbox + 0x40), w - 0x40);
9054 v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_rsbox + 0x80), w - 0x80);
9055 v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_rsbox + 0xc0), w - 0xc0);
9056
9057 // add round key
9058 return vreinterpretq_m128i_u8(v) ^ RoundKey;
9059
9060#else /* ARMv7-A NEON implementation */
9061 /* FIXME: optimized for NEON */
9062 uint8_t v[4][4];
9063 uint8_t *_a = (uint8_t *) &a;
9064 for (int i = 0; i < 16; ++i) {
9065 v[((i / 4) + (i % 4)) % 4][i % 4] = _sse2neon_rsbox[_a[i]];
9066 }
9067
9068 return _mm_xor_si128(vreinterpretq_m128i_u8(vld1q_u8((uint8_t *) v)),
9069 RoundKey);
9070#endif
9071}
9072
9073// Perform the InvMixColumns transformation on a and store the result in dst.
9074// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_aesimc_si128
9075FORCE_INLINE __m128i _mm_aesimc_si128(__m128i a)
9076{
9077#if defined(__aarch64__)
9078 static const uint8_t ror32by8[] = {
9079 0x1, 0x2, 0x3, 0x0, 0x5, 0x6, 0x7, 0x4,
9080 0x9, 0xa, 0xb, 0x8, 0xd, 0xe, 0xf, 0xc,
9081 };
9082 uint8x16_t v = vreinterpretq_u8_m128i(a);
9083 uint8x16_t w;
9084
9085 // multiplying 'v' by 4 in GF(2^8)
9086 w = (v << 1) ^ (uint8x16_t) (((int8x16_t) v >> 7) & 0x1b);
9087 w = (w << 1) ^ (uint8x16_t) (((int8x16_t) w >> 7) & 0x1b);
9088 v ^= w;
9089 v ^= (uint8x16_t) vrev32q_u16((uint16x8_t) w);
9090
9091 // multiplying 'v' by 2 in GF(2^8)
9092 w = (v << 1) ^ (uint8x16_t) (((int8x16_t) v >> 7) & 0x1b);
9093 w ^= (uint8x16_t) vrev32q_u16((uint16x8_t) v);
9094 w ^= vqtbl1q_u8(v ^ w, vld1q_u8(ror32by8));
9095 return vreinterpretq_m128i_u8(w);
9096
9097#else /* ARMv7-A NEON implementation */
9098 uint8_t i, e, f, g, h, v[4][4];
9099 vst1q_u8((uint8_t *) v, vreinterpretq_u8_m128i(a));
9100 for (i = 0; i < 4; ++i) {
9101 e = v[i][0];
9102 f = v[i][1];
9103 g = v[i][2];
9104 h = v[i][3];
9105
9106 v[i][0] = SSE2NEON_MULTIPLY(e, 0x0e) ^ SSE2NEON_MULTIPLY(f, 0x0b) ^
9107 SSE2NEON_MULTIPLY(g, 0x0d) ^ SSE2NEON_MULTIPLY(h, 0x09);
9108 v[i][1] = SSE2NEON_MULTIPLY(e, 0x09) ^ SSE2NEON_MULTIPLY(f, 0x0e) ^
9109 SSE2NEON_MULTIPLY(g, 0x0b) ^ SSE2NEON_MULTIPLY(h, 0x0d);
9110 v[i][2] = SSE2NEON_MULTIPLY(e, 0x0d) ^ SSE2NEON_MULTIPLY(f, 0x09) ^
9111 SSE2NEON_MULTIPLY(g, 0x0e) ^ SSE2NEON_MULTIPLY(h, 0x0b);
9112 v[i][3] = SSE2NEON_MULTIPLY(e, 0x0b) ^ SSE2NEON_MULTIPLY(f, 0x0d) ^
9113 SSE2NEON_MULTIPLY(g, 0x09) ^ SSE2NEON_MULTIPLY(h, 0x0e);
9114 }
9115
9116 return vreinterpretq_m128i_u8(vld1q_u8((uint8_t *) v));
9117#endif
9118}
9119
9120// Assist in expanding the AES cipher key by computing steps towards generating
9121// a round key for encryption cipher using data from a and an 8-bit round
9122// constant specified in imm8, and store the result in dst.
9123// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_aeskeygenassist_si128
9124//
9125// Emits the Advanced Encryption Standard (AES) instruction aeskeygenassist.
9126// This instruction generates a round key for AES encryption. See
9127// https://kazakov.life/2017/11/01/cryptocurrency-mining-on-ios-devices/
9128// for details.
9129FORCE_INLINE __m128i _mm_aeskeygenassist_si128(__m128i a, const int rcon)
9130{
9131#if defined(__aarch64__)
9132 uint8x16_t _a = vreinterpretq_u8_m128i(a);
9133 uint8x16_t v = vqtbl4q_u8(_sse2neon_vld1q_u8_x4(_sse2neon_sbox), _a);
9134 v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_sbox + 0x40), _a - 0x40);
9135 v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_sbox + 0x80), _a - 0x80);
9136 v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_sbox + 0xc0), _a - 0xc0);
9137
9138 uint32x4_t v_u32 = vreinterpretq_u32_u8(v);
9139 uint32x4_t ror_v = vorrq_u32(vshrq_n_u32(v_u32, 8), vshlq_n_u32(v_u32, 24));
9140 uint32x4_t ror_xor_v = veorq_u32(ror_v, vdupq_n_u32(rcon));
9141
9142 return vreinterpretq_m128i_u32(vtrn2q_u32(v_u32, ror_xor_v));
9143
9144#else /* ARMv7-A NEON implementation */
9145 uint32_t X1 = _mm_cvtsi128_si32(_mm_shuffle_epi32(a, 0x55));
9146 uint32_t X3 = _mm_cvtsi128_si32(_mm_shuffle_epi32(a, 0xFF));
9147 for (int i = 0; i < 4; ++i) {
9148 ((uint8_t *) &X1)[i] = _sse2neon_sbox[((uint8_t *) &X1)[i]];
9149 ((uint8_t *) &X3)[i] = _sse2neon_sbox[((uint8_t *) &X3)[i]];
9150 }
9151 return _mm_set_epi32(((X3 >> 8) | (X3 << 24)) ^ rcon, X3,
9152 ((X1 >> 8) | (X1 << 24)) ^ rcon, X1);
9153#endif
9154}
9155#undef SSE2NEON_AES_SBOX
9156#undef SSE2NEON_AES_RSBOX
9157
9158#if defined(__aarch64__)
9159#undef SSE2NEON_XT
9160#undef SSE2NEON_MULTIPLY
9161#endif
9162
9163#else /* __ARM_FEATURE_CRYPTO */
9164// Implements equivalent of 'aesenc' by combining AESE (with an empty key) and
9165// AESMC and then manually applying the real key as an xor operation. This
9166// unfortunately means an additional xor op; the compiler should be able to
9167// optimize this away for repeated calls however. See
9168// https://blog.michaelbrase.com/2018/05/08/emulating-x86-aes-intrinsics-on-armv8-a
9169// for more details.
9170FORCE_INLINE __m128i _mm_aesenc_si128(__m128i a, __m128i b)
9171{
9172 return vreinterpretq_m128i_u8(veorq_u8(
9173        vaesmcq_u8(vaeseq_u8(vreinterpretq_u8_m128i(a), vdupq_n_u8(0))),
9174        vreinterpretq_u8_m128i(b)));
9175}
9176
9177// Perform one round of an AES decryption flow on data (state) in a using the
9178// round key in RoundKey, and store the result in dst.
9179// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_aesdec_si128
9180FORCE_INLINE __m128i _mm_aesdec_si128(__m128i a, __m128i RoundKey)
9181{
9182 return vreinterpretq_m128i_u8(veorq_u8(
9183 vaesimcq_u8(vaesdq_u8(vreinterpretq_u8_m128i(a), vdupq_n_u8(0))),
9184 vreinterpretq_u8_m128i(RoundKey)));
9185}
9186
9187// Perform the last round of an AES encryption flow on data (state) in a using
9188// the round key in RoundKey, and store the result in dst.
9189// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_aesenclast_si128
9190FORCE_INLINE __m128i _mm_aesenclast_si128(__m128i a, __m128i RoundKey)
9191{
9192 return _mm_xor_si128(vreinterpretq_m128i_u8(vaeseq_u8(
9193 vreinterpretq_u8_m128i(a), vdupq_n_u8(0))),
9194 RoundKey);
9195}
9196
9197// Perform the last round of an AES decryption flow on data (state) in a using
9198// the round key in RoundKey, and store the result in dst.
9199// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_aesdeclast_si128
9200FORCE_INLINE __m128i _mm_aesdeclast_si128(__m128i a, __m128i RoundKey)
9201{
9202    return vreinterpretq_m128i_u8(
9203        veorq_u8(vaesdq_u8(vreinterpretq_u8_m128i(a), vdupq_n_u8(0)),
9204 vreinterpretq_u8_m128i(RoundKey)));
9205}
9206
9207// Perform the InvMixColumns transformation on a and store the result in dst.
9208// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_aesimc_si128
9209FORCE_INLINE __m128i _mm_aesimc_si128(__m128i a)
9210{
9211 return vreinterpretq_m128i_u8(vaesimcq_u8(vreinterpretq_u8_m128i(a)));
9212}
9213
9214// Assist in expanding the AES cipher key by computing steps towards generating
9215// a round key for encryption cipher using data from a and an 8-bit round
9216// constant specified in imm8, and store the result in dst.
9217// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_aeskeygenassist_si128
9218FORCE_INLINE __m128i _mm_aeskeygenassist_si128(__m128i a, const int rcon)
9219{
9220 // AESE does ShiftRows and SubBytes on A
9221 uint8x16_t u8 = vaeseq_u8(vreinterpretq_u8_m128i(a), vdupq_n_u8(0));
9222
9223#if !defined(_MSC_VER) || defined(__clang__)
9224 uint8x16_t dest = {
9225 // Undo ShiftRows step from AESE and extract X1 and X3
9226 u8[0x4], u8[0x1], u8[0xE], u8[0xB], // SubBytes(X1)
9227 u8[0x1], u8[0xE], u8[0xB], u8[0x4], // ROT(SubBytes(X1))
9228 u8[0xC], u8[0x9], u8[0x6], u8[0x3], // SubBytes(X3)
9229 u8[0x9], u8[0x6], u8[0x3], u8[0xC], // ROT(SubBytes(X3))
9230 };
9231    uint32x4_t r = {0, (unsigned) rcon, 0, (unsigned) rcon};
9232    return vreinterpretq_m128i_u8(dest) ^ vreinterpretq_m128i_u32(r);
9233#else
9234 // We have to do this hack because MSVC is strictly adhering to the CPP
9235 // standard, in particular C++03 8.5.1 sub-section 15, which states that
9236 // unions must be initialized by their first member type.
9237
9238 // As per the Windows ARM64 ABI, it is always little endian, so this works
9239 __n128 dest{
9240 ((uint64_t) u8.n128_u8[0x4] << 0) | ((uint64_t) u8.n128_u8[0x1] << 8) |
9241 ((uint64_t) u8.n128_u8[0xE] << 16) |
9242 ((uint64_t) u8.n128_u8[0xB] << 24) |
9243 ((uint64_t) u8.n128_u8[0x1] << 32) |
9244 ((uint64_t) u8.n128_u8[0xE] << 40) |
9245 ((uint64_t) u8.n128_u8[0xB] << 48) |
9246 ((uint64_t) u8.n128_u8[0x4] << 56),
9247 ((uint64_t) u8.n128_u8[0xC] << 0) | ((uint64_t) u8.n128_u8[0x9] << 8) |
9248 ((uint64_t) u8.n128_u8[0x6] << 16) |
9249 ((uint64_t) u8.n128_u8[0x3] << 24) |
9250 ((uint64_t) u8.n128_u8[0x9] << 32) |
9251 ((uint64_t) u8.n128_u8[0x6] << 40) |
9252 ((uint64_t) u8.n128_u8[0x3] << 48) |
9253 ((uint64_t) u8.n128_u8[0xC] << 56)};
9254
9255 dest.n128_u32[1] = dest.n128_u32[1] ^ rcon;
9256 dest.n128_u32[3] = dest.n128_u32[3] ^ rcon;
9257
9258 return dest;
9259#endif
9260}
9261#endif
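// Usage sketches (illustrative only, not part of the upstream header): the
// way the AES intrinsics above are normally chained on x86, shown for AES-128
// with 11 pre-expanded round keys. Helper names are hypothetical; on real x86
// the rcon argument of _mm_aeskeygenassist_si128 must be a compile-time
// constant, so the key-schedule comment spells each step with a literal.
FORCE_INLINE __m128i sse2neon_example_aes128_encrypt(__m128i block,
                                                     const __m128i rk[11])
{
    block = _mm_xor_si128(block, rk[0]);        // initial AddRoundKey
    for (int i = 1; i < 10; i++)
        block = _mm_aesenc_si128(block, rk[i]); // rounds 1..9
    return _mm_aesenclast_si128(block, rk[10]); // round 10 (no MixColumns)
}

FORCE_INLINE __m128i sse2neon_example_aes128_decrypt(__m128i block,
                                                     const __m128i rk[11])
{
    // Equivalent inverse cipher: reuse the encryption round keys in reverse
    // order, pushing keys 1..9 through InvMixColumns. Real code would
    // precompute the transformed keys instead of recomputing them per block.
    block = _mm_xor_si128(block, rk[10]);
    for (int i = 9; i > 0; i--)
        block = _mm_aesdec_si128(block, _mm_aesimc_si128(rk[i]));
    return _mm_aesdeclast_si128(block, rk[0]);
}

FORCE_INLINE __m128i sse2neon_example_aes128_expand_step(__m128i prev,
                                                         __m128i assist)
{
    // broadcast the rotated, substituted word (already XORed with rcon)
    assist = _mm_shuffle_epi32(assist, 0xFF);
    prev = _mm_xor_si128(prev, _mm_slli_si128(prev, 4));
    prev = _mm_xor_si128(prev, _mm_slli_si128(prev, 4));
    prev = _mm_xor_si128(prev, _mm_slli_si128(prev, 4));
    return _mm_xor_si128(prev, assist);
}
// e.g. rk[0] = key;
//      rk[1] = sse2neon_example_aes128_expand_step(
//                  rk[0], _mm_aeskeygenassist_si128(rk[0], 0x01));
//      ... continuing with rcon 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
//      0x1b, 0x36 for rounds 2..10.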
9262
9263/* Others */
9264
9265// Perform a carry-less multiplication of two 64-bit integers, selected from a
9266// and b according to imm8, and store the results in dst.
9267// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_clmulepi64_si128
9268FORCE_INLINE __m128i _mm_clmulepi64_si128(__m128i _a, __m128i _b, const int imm)
9269{
9270 uint64x2_t a = vreinterpretq_u64_m128i(_a);
9271 uint64x2_t b = vreinterpretq_u64_m128i(_b);
9272 switch (imm & 0x11) {
9273    case 0x00:
9274        return vreinterpretq_m128i_u64(
9275            _sse2neon_vmull_p64(vget_low_u64(a), vget_low_u64(b)));
9276    case 0x01:
9277        return vreinterpretq_m128i_u64(
9278            _sse2neon_vmull_p64(vget_high_u64(a), vget_low_u64(b)));
9279    case 0x10:
9280        return vreinterpretq_m128i_u64(
9281            _sse2neon_vmull_p64(vget_low_u64(a), vget_high_u64(b)));
9282    case 0x11:
9283        return vreinterpretq_m128i_u64(
9284            _sse2neon_vmull_p64(vget_high_u64(a), vget_high_u64(b)));
9285 default:
9286 abort();
9287 }
9288}
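// Usage sketch (illustrative only, not part of the upstream header): the imm8
// nibbles select which 64-bit half of each operand enters the carry-less
// multiply, as the switch above shows. A common use is building the full
// 128x128 -> 256-bit product for GHASH-style code; names are hypothetical.
FORCE_INLINE void sse2neon_example_clmul_256(__m128i a, __m128i b,
                                             __m128i *lo, __m128i *hi)
{
    __m128i t0 = _mm_clmulepi64_si128(a, b, 0x00); // a.lo * b.lo
    __m128i t1 = _mm_clmulepi64_si128(a, b, 0x10); // a.lo * b.hi
    __m128i t2 = _mm_clmulepi64_si128(a, b, 0x01); // a.hi * b.lo
    __m128i t3 = _mm_clmulepi64_si128(a, b, 0x11); // a.hi * b.hi
    __m128i mid = _mm_xor_si128(t1, t2);           // cross terms
    *lo = _mm_xor_si128(t0, _mm_slli_si128(mid, 8));
    *hi = _mm_xor_si128(t3, _mm_srli_si128(mid, 8));
}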
9289
9290FORCE_INLINE unsigned int _sse2neon_mm_get_denormals_zero_mode(void)
9291{
9292 union {
9293 fpcr_bitfield field;
9294#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
9295 uint64_t value;
9296#else
9297 uint32_t value;
9298#endif
9299 } r;
9300
9301#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
9302 r.value = _sse2neon_get_fpcr();
9303#else
9304 __asm__ __volatile__("vmrs %0, FPSCR" : "=r"(r.value)); /* read */
9305#endif
9306
9307 return r.field.bit24 ? _MM_DENORMALS_ZERO_ON : _MM_DENORMALS_ZERO_OFF;
9308}
9309
9310// Count the number of bits set to 1 in unsigned 32-bit integer a, and
9311// return that count in dst.
9312// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_popcnt_u32
9313FORCE_INLINE int _mm_popcnt_u32(unsigned int a)
9314{
9315#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
9316#if __has_builtin(__builtin_popcount)
9317 return __builtin_popcount(a);
9318#elif defined(_MSC_VER)
9319 return _CountOneBits(a);
9320#else
9321 return (int) vaddlv_u8(vcnt_u8(vcreate_u8((uint64_t) a)));
9322#endif
9323#else
9324 uint32_t count = 0;
9325 uint8x8_t input_val, count8x8_val;
9326 uint16x4_t count16x4_val;
9327 uint32x2_t count32x2_val;
9328
9329    input_val = vcreate_u8((uint64_t) a); /* widen to 8 bytes; avoid reading past the 32-bit argument */
9330 count8x8_val = vcnt_u8(input_val);
9331 count16x4_val = vpaddl_u8(count8x8_val);
9332 count32x2_val = vpaddl_u16(count16x4_val);
9333
9334    vst1_lane_u32(&count, count32x2_val, 0); /* store only the low lane */
9335 return count;
9336#endif
9337}
9338
9339// Count the number of bits set to 1 in unsigned 64-bit integer a, and
9340// return that count in dst.
9341// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_popcnt_u64
9342FORCE_INLINE int64_t _mm_popcnt_u64(uint64_t a)
9343{
9344#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
9345#if __has_builtin(__builtin_popcountll)
9346 return __builtin_popcountll(a);
9347#elif defined(_MSC_VER)
9348 return _CountOneBits64(a);
9349#else
9350 return (int64_t) vaddlv_u8(vcnt_u8(vcreate_u8(a)));
9351#endif
9352#else
9353 uint64_t count = 0;
9354 uint8x8_t input_val, count8x8_val;
9355 uint16x4_t count16x4_val;
9356 uint32x2_t count32x2_val;
9357 uint64x1_t count64x1_val;
9358
9359 input_val = vld1_u8((uint8_t *) &a);
9360 count8x8_val = vcnt_u8(input_val);
9361 count16x4_val = vpaddl_u8(count8x8_val);
9362 count32x2_val = vpaddl_u16(count16x4_val);
9363 count64x1_val = vpaddl_u32(count32x2_val);
9364 vst1_u64(&count, count64x1_val);
9365 return count;
9366#endif
9367}
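// Usage sketch (illustrative only, not part of the upstream header): the
// population counts above make Hamming distance between two bit masks a
// one-liner. The helper name is hypothetical.
FORCE_INLINE int64_t sse2neon_example_hamming64(uint64_t x, uint64_t y)
{
    return _mm_popcnt_u64(x ^ y);
}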
9368
9369FORCE_INLINE void _sse2neon_mm_set_denormals_zero_mode(unsigned int flag)
9370{
9371 // AArch32 Advanced SIMD arithmetic always uses the Flush-to-zero setting,
9372 // regardless of the value of the FZ bit.
9373 union {
9374 fpcr_bitfield field;
9375#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
9376 uint64_t value;
9377#else
9378 uint32_t value;
9379#endif
9380 } r;
9381
9382#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
9383 r.value = _sse2neon_get_fpcr();
9384#else
9385 __asm__ __volatile__("vmrs %0, FPSCR" : "=r"(r.value)); /* read */
9386#endif
9387
9388 r.field.bit24 = (flag & _MM_DENORMALS_ZERO_MASK) == _MM_DENORMALS_ZERO_ON;
9389
9390#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
9391 _sse2neon_set_fpcr(r.value);
9392#else
9393 __asm__ __volatile__("vmsr FPSCR, %0" ::"r"(r)); /* write */
9394#endif
9395}
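// Usage sketch (illustrative only, not part of the upstream header): run a
// latency-sensitive kernel with input denormals flushed to zero, then restore
// the previous mode. The helper name is hypothetical.
FORCE_INLINE void sse2neon_example_with_daz(void (*kernel)(void))
{
    unsigned int saved = _sse2neon_mm_get_denormals_zero_mode();
    _sse2neon_mm_set_denormals_zero_mode(_MM_DENORMALS_ZERO_ON);
    kernel(); // hot loop runs with denormals treated as zero
    _sse2neon_mm_set_denormals_zero_mode(saved);
}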
9396
9397// Return the current 64-bit value of the processor's time-stamp counter.
9398// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=rdtsc
9399FORCE_INLINE uint64_t _rdtsc(void)
9400{
9401#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
9402 uint64_t val;
9403
9404 /* According to ARM DDI 0487F.c, from Armv8.0 to Armv8.5 inclusive, the
9405 * system counter is at least 56 bits wide; from Armv8.6, the counter
9406     * must be 64 bits wide. So the system counter may be narrower than 64
9407     * bits, in which case the kernel reports it with the
9408     * 'cap_user_time_short' flag set.
9409 */
9410#if defined(_MSC_VER) && !defined(__clang__)
9411 val = _ReadStatusReg(ARM64_SYSREG(3, 3, 14, 0, 2));
9412#else
9413 __asm__ __volatile__("mrs %0, cntvct_el0" : "=r"(val));
9414#endif
9415
9416 return val;
9417#else
9418 uint32_t pmccntr, pmuseren, pmcntenset;
9419 // Read the user mode Performance Monitoring Unit (PMU)
9420 // User Enable Register (PMUSERENR) access permissions.
9421 __asm__ __volatile__("mrc p15, 0, %0, c9, c14, 0" : "=r"(pmuseren));
9422 if (pmuseren & 1) { // Allows reading PMUSERENR for user mode code.
9423 __asm__ __volatile__("mrc p15, 0, %0, c9, c12, 1" : "=r"(pmcntenset));
9424 if (pmcntenset & 0x80000000UL) { // Is it counting?
9425 __asm__ __volatile__("mrc p15, 0, %0, c9, c13, 0" : "=r"(pmccntr));
9426 // The counter is set up to count every 64th cycle
9427 return (uint64_t) (pmccntr) << 6;
9428 }
9429 }
9430
9431 // Fallback to syscall as we can't enable PMUSERENR in user mode.
9432 struct timeval tv;
9433 gettimeofday(&tv, NULL);
9434 return (uint64_t) (tv.tv_sec) * 1000000 + tv.tv_usec;
9435#endif
9436}
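// Usage sketch (illustrative only, not part of the upstream header): bracket
// a region with _rdtsc() for a coarse tick count. On AArch64 this reads the
// generic timer (cntvct_el0), so the unit is timer ticks rather than CPU
// cycles; scale by the counter frequency if wall-clock time is needed. The
// helper name is hypothetical.
FORCE_INLINE uint64_t sse2neon_example_measure_ticks(void (*fn)(void))
{
    uint64_t start = _rdtsc();
    fn();
    return _rdtsc() - start;
}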
9437
9438#if defined(__GNUC__) || defined(__clang__)
9439#pragma pop_macro("ALIGN_STRUCT")
9440#pragma pop_macro("FORCE_INLINE")
9441#endif
9442
9443#if defined(__GNUC__) && !defined(__clang__)
9444#pragma GCC pop_options
9445#endif
9446
9447#endif
Definition sse2neon.h:4117
FORCE_INLINE __m128i _mm_xor_si128(__m128i a, __m128i b)
Definition sse2neon.h:6012
FORCE_INLINE __m128i _mm_maddubs_epi16(__m128i _a, __m128i _b)
Definition sse2neon.h:6489
FORCE_INLINE __m128 _mm_shuffle_ps_0022(__m128 a, __m128 b)
Definition sse2neon.h:832
FORCE_INLINE __m128i _mm_cmpeq_epi8(__m128i a, __m128i b)
Definition sse2neon.h:3241
#define vreinterpretq_s32_m128(x)
Definition sse2neon.h:459
FORCE_INLINE void _mm_store_ss(float *p, __m128 a)
Definition sse2neon.h:2750
FORCE_INLINE __m64 _mm_mulhrs_pi16(__m64 a, __m64 b)
Definition sse2neon.h:6577
FORCE_INLINE __m128 _mm_cmpnle_ss(__m128 a, __m128 b)
Definition sse2neon.h:1415
FORCE_INLINE __m64 _mm_hsubs_pi16(__m64 _a, __m64 _b)
Definition sse2neon.h:6472
FORCE_INLINE __m128i _mm_castpd_si128(__m128d a)
Definition sse2neon.h:3154
FORCE_INLINE __m128d _mm_cmpnle_pd(__m128d a, __m128d b)
Definition sse2neon.h:3616
FORCE_INLINE __m128i _mm_sll_epi32(__m128i a, __m128i count)
Definition sse2neon.h:5296
FORCE_INLINE __m128i _mm_sign_epi32(__m128i _a, __m128i _b)
Definition sse2neon.h:6660
FORCE_INLINE uint8_t _sse2neon_vaddv_u8(uint8x8_t v8)
Definition sse2neon.h:645
FORCE_INLINE __m128i _mm_cvtepi16_epi32(__m128i a)
Definition sse2neon.h:6968
#define vreinterpretq_u32_m128i(x)
Definition sse2neon.h:482
FORCE_INLINE void * _mm_malloc(size_t size, size_t align)
Definition sse2neon.h:1989
FORCE_INLINE __m64 _mm_hadds_pi16(__m64 _a, __m64 _b)
Definition sse2neon.h:6379
FORCE_INLINE __m128i _mm_setr_epi8(signed char b0, signed char b1, signed char b2, signed char b3, signed char b4, signed char b5, signed char b6, signed char b7, signed char b8, signed char b9, signed char b10, signed char b11, signed char b12, signed char b13, signed char b14, signed char b15)
Definition sse2neon.h:5117
FORCE_INLINE __m128i _mm_load_si128(const __m128i *p)
Definition sse2neon.h:4361
FORCE_INLINE __m128 _mm_andnot_ps(__m128 a, __m128 b)
Definition sse2neon.h:1232
FORCE_INLINE __m128 _mm_movelh_ps(__m128 __A, __m128 __B)
Definition sse2neon.h:2153
FORCE_INLINE __m128d _mm_cmpunord_pd(__m128d a, __m128d b)
Definition sse2neon.h:3736
FORCE_INLINE __m128i _mm_sub_epi16(__m128i a, __m128i b)
Definition sse2neon.h:5705
FORCE_INLINE __m128i _mm_cvtepi8_epi64(__m128i a)
Definition sse2neon.h:7018
FORCE_INLINE __m128i _mm_shuffle_epi_3332(__m128i a)
Definition sse2neon.h:1115
FORCE_INLINE __m128 _mm_rsqrt_ss(__m128 in)
Definition sse2neon.h:2406
#define vreinterpretq_m128i_s64(x)
Definition sse2neon.h:465
FORCE_INLINE __m128 _mm_setzero_ps(void)
Definition sse2neon.h:2546
FORCE_INLINE int _sse2neon_clz(unsigned int x)
Definition sse2neon.h:8290
#define vreinterpretq_m128_s64(x)
Definition sse2neon.h:446
FORCE_INLINE void _mm_storeu_si128(__m128i *p, __m128i a)
Definition sse2neon.h:5642
FORCE_INLINE void _mm_free(void *addr)
Definition sse2neon.h:1807
#define vreinterpretq_u32_m128(x)
Definition sse2neon.h:454
#define SSE2NEON_COMP_AGG(a, b, la, lb, imm8, IE)
Definition sse2neon.h:8351
FORCE_INLINE __m128d _mm_div_pd(__m128d a, __m128d b)
Definition sse2neon.h:4269
FORCE_INLINE __m128 _mm_loadr_ps(const float *p)
Definition sse2neon.h:1951
FORCE_INLINE __m128i _mm_sad_epu8(__m128i a, __m128i b)
Definition sse2neon.h:4936
FORCE_INLINE int _mm_comigt_sd(__m128d a, __m128d b)
Definition sse2neon.h:3802
FORCE_INLINE __m128i _mm_cvtepu8_epi64(__m128i a)
Definition sse2neon.h:7080
FORCE_INLINE __m128 _mm_and_ps(__m128 a, __m128 b)
Definition sse2neon.h:1223
FORCE_INLINE __m128i _mm_aeskeygenassist_si128(__m128i a, const int rcon)
Definition sse2neon.h:9129
FORCE_INLINE int64_t sse2neon_recast_f64_s64(double val)
Definition sse2neon.h:137
FORCE_INLINE __m128i _mm_cmplt_epi32(__m128i a, __m128i b)
Definition sse2neon.h:3455
FORCE_INLINE __m128d _mm_cmple_pd(__m128d a, __m128d b)
Definition sse2neon.h:3397
FORCE_INLINE __m128i _mm_hsub_epi16(__m128i _a, __m128i _b)
Definition sse2neon.h:6394
FORCE_INLINE int _mm_comineq_ss(__m128 a, __m128 b)
Definition sse2neon.h:1540
FORCE_INLINE __m128i _mm_set1_epi64x(int64_t _i)
Definition sse2neon.h:5060
FORCE_INLINE __m128i _mm_move_epi64(__m128i a)
Definition sse2neon.h:4633
#define _MM_ROUND_UP
Definition sse2neon.h:385
FORCE_INLINE __m128i _mm_slli_epi16(__m128i a, int imm)
Definition sse2neon.h:5322
FORCE_INLINE __m128 _mm_div_ss(__m128 a, __m128 b)
Definition sse2neon.h:1790
FORCE_INLINE int _mm_cmpistrc(__m128i a, __m128i b, const int imm8)
Definition sse2neon.h:8524
int64x1_t __m64
Definition sse2neon.h:405
FORCE_INLINE __m128i _mm_min_epu16(__m128i a, __m128i b)
Definition sse2neon.h:7377
FORCE_INLINE __m128i _mm_min_epu8(__m128i a, __m128i b)
Definition sse2neon.h:4576
FORCE_INLINE __m128i _mm_sra_epi32(__m128i a, __m128i count)
Definition sse2neon.h:5412
FORCE_INLINE __m128 _mm_castsi128_ps(__m128i a)
Definition sse2neon.h:3190
FORCE_INLINE __m128i _mm_cmplt_epi8(__m128i a, __m128i b)
Definition sse2neon.h:3465
#define vreinterpret_s32_m64(x)
Definition sse2neon.h:506
FORCE_INLINE int _mm_cmpistrs(__m128i a, __m128i b, const int imm8)
Definition sse2neon.h:8560
FORCE_INLINE __m64 _mm_shuffle_pi8(__m64 a, __m64 b)
Definition sse2neon.h:6619
#define vreinterpretq_m128d_u32(x)
Definition sse2neon.h:531
FORCE_INLINE __m128d _mm_andnot_pd(__m128d a, __m128d b)
Definition sse2neon.h:3098
FORCE_INLINE __m128d _mm_cvtss_sd(__m128d a, __m128 b)
Definition sse2neon.h:4194
#define vreinterpret_m64_u64(x)
Definition sse2neon.h:493
FORCE_INLINE __m64 _mm_max_pi16(__m64 a, __m64 b)
Definition sse2neon.h:2029
#define __int64
Definition sse2neon.h:428
FORCE_INLINE __m128 _mm_shuffle_ps_3202(__m128 a, __m128 b)
Definition sse2neon.h:848
FORCE_INLINE __m128i _mm_cmpgt_epi16(__m128i a, __m128i b)
Definition sse2neon.h:3323
FORCE_INLINE __m128 _mm_floor_ps(__m128)
Definition sse2neon.h:7239
FORCE_INLINE __m128d _mm_cmpgt_pd(__m128d a, __m128d b)
Definition sse2neon.h:3350
#define vreinterpretq_u64_m128i(x)
Definition sse2neon.h:483
FORCE_INLINE __m128i _mm_cvtsi64_si128(int64_t a)
Definition sse2neon.h:4173
FORCE_INLINE __m128i _mm_cvtepu16_epi32(__m128i a)
Definition sse2neon.h:7030
FORCE_INLINE __m128i _mm_loadl_epi64(__m128i const *p)
Definition sse2neon.h:4395
FORCE_INLINE void _sse2neon_mm_set_denormals_zero_mode(unsigned int flag)
Definition sse2neon.h:9369
FORCE_INLINE __m128 _mm_blendv_ps(__m128 _a, __m128 _b, __m128 _mask)
Definition sse2neon.h:6887
FORCE_INLINE __m128i _mm_set_epi64x(int64_t, int64_t)
Definition sse2neon.h:4974
#define _MM_ROUND_TOWARD_ZERO
Definition sse2neon.h:386
FORCE_INLINE __m128 _mm_shuffle_ps_0101(__m128 a, __m128 b)
Definition sse2neon.h:809
FORCE_INLINE __m128 _mm_cmpeq_ps(__m128 a, __m128 b)
Definition sse2neon.h:1260
FORCE_INLINE __m128i _mm_aesimc_si128(__m128i a)
Definition sse2neon.h:9075
FORCE_INLINE unsigned int _MM_GET_ROUNDING_MODE(void)
Definition sse2neon.h:1865
FORCE_INLINE __m64 _mm_sub_si64(__m64 a, __m64 b)
Definition sse2neon.h:5775
FORCE_INLINE void _mm_maskmoveu_si128(__m128i a, __m128i mask, char *mem_addr)
Definition sse2neon.h:4487
FORCE_INLINE __m128 _mm_cmplt_ss(__m128 a, __m128 b)
Definition sse2neon.h:1342
FORCE_INLINE __m128i _mm_max_epi32(__m128i a, __m128i b)
Definition sse2neon.h:7323
FORCE_INLINE int _mm_cmpestra(__m128i a, int la, __m128i b, int lb, const int imm8)
Definition sse2neon.h:8393
#define SSE2NEON_CMPESTR_LIST
Definition sse2neon.h:8241
FORCE_INLINE __m128i _mm_subs_epi8(__m128i a, __m128i b)
Definition sse2neon.h:5793
FORCE_INLINE __m128 _mm_add_ps(__m128 a, __m128 b)
Definition sse2neon.h:1202
FORCE_INLINE void _sse2neon_set_fpcr(uint64_t value)
Definition sse2neon.h:1828
FORCE_INLINE __m128 _mm_cmpord_ss(__m128 a, __m128 b)
Definition sse2neon.h:1461
FORCE_INLINE __m128 _mm_cmplt_ps(__m128 a, __m128 b)
Definition sse2neon.h:1332
FORCE_INLINE int _mm_movemask_ps(__m128 a)
Definition sse2neon.h:2184
FORCE_INLINE __m128d _mm_floor_sd(__m128d a, __m128d b)
Definition sse2neon.h:7255
FORCE_INLINE __m128i _mm_setr_epi16(short w0, short w1, short w2, short w3, short w4, short w5, short w6, short w7)
Definition sse2neon.h:5087
FORCE_INLINE __m128i _mm_or_si128(__m128i, __m128i)
Definition sse2neon.h:4880
FORCE_INLINE __m128d _mm_cmpneq_sd(__m128d a, __m128d b)
Definition sse2neon.h:3538
FORCE_INLINE unsigned int _sse2neon_mm_get_denormals_zero_mode(void)
Definition sse2neon.h:9290
FORCE_INLINE __m128i _mm_aesdeclast_si128(__m128i a, __m128i RoundKey)
Definition sse2neon.h:9037
FORCE_INLINE __m128i _mm_cvtepi8_epi16(__m128i a)
Definition sse2neon.h:6997
FORCE_INLINE __m128i _mm_cvtepu32_epi64(__m128i a)
Definition sse2neon.h:7050
#define _mm_set_pd1
Definition sse2neon.h:5023
FORCE_INLINE __m128i _mm_max_epi16(__m128i a, __m128i b)
Definition sse2neon.h:4500
FORCE_INLINE void _mm_stream_si32(int *p, int a)
Definition sse2neon.h:5688
FORCE_INLINE __m128i _mm_max_epu16(__m128i a, __m128i b)
Definition sse2neon.h:7341
FORCE_INLINE __m128i _mm_hsub_epi32(__m128i _a, __m128i _b)
Definition sse2neon.h:6410
FORCE_INLINE __m128d _mm_cmpnlt_sd(__m128d a, __m128d b)
Definition sse2neon.h:3678
FORCE_INLINE __m128i _mm_aesdec_si128(__m128i a, __m128i RoundKey)
Definition sse2neon.h:8915
FORCE_INLINE __m128i _mm_adds_epi16(__m128i a, __m128i b)
Definition sse2neon.h:3044
FORCE_INLINE int32_t _mm_cvtsd_si32(__m128d a)
Definition sse2neon.h:4057
FORCE_INLINE void _mm_storel_epi64(__m128i *a, __m128i b)
Definition sse2neon.h:5603
FORCE_INLINE __m128d _mm_set1_pd(double d)
Definition sse2neon.h:5075
#define _MM_FROUND_TO_NEAREST_INT
Definition sse2neon.h:370
FORCE_INLINE __m128 _mm_unpacklo_ps(__m128 a, __m128 b)
Definition sse2neon.h:2933
FORCE_INLINE __m128 _mm_cvtsi64_ss(__m128 a, int64_t b)
Definition sse2neon.h:1695
#define vreinterpretq_m128d_u64(x)
Definition sse2neon.h:532
FORCE_INLINE __m128i _mm_cvtsi32_si128(int a)
Definition sse2neon.h:4149
FORCE_INLINE __m128 _mm_cmple_ss(__m128 a, __m128 b)
Definition sse2neon.h:1324
FORCE_INLINE __m128d _mm_sub_pd(__m128d a, __m128d b)
Definition sse2neon.h:5742
FORCE_INLINE uint64_t _mm_crc32_u64(uint64_t crc, uint64_t v)
Definition sse2neon.h:8636
#define SSE2NEON_AES_U3(p)
#define vreinterpret_s16_m64(x)
Definition sse2neon.h:505
FORCE_INLINE __m128i _mm_abs_epi16(__m128i a)
Definition sse2neon.h:6201
FORCE_INLINE __m128i _mm_set_epi16(short i7, short i6, short i5, short i4, short i3, short i2, short i1, short i0)
Definition sse2neon.h:4944
FORCE_INLINE __m128d _mm_load_sd(const double *p)
Definition sse2neon.h:4347
FORCE_INLINE __m128 _mm_cmpnge_ss(__m128 a, __m128 b)
Definition sse2neon.h:1379
FORCE_INLINE __m128 _mm_cvtpd_ps(__m128d a)
Definition sse2neon.h:3940
#define vreinterpretq_m128i_u32(x)
Definition sse2neon.h:469
int16_t ALIGN_STRUCT(1) unaligned_int16_t
Definition sse2neon.h:418
FORCE_INLINE __m128i _mm_subs_epu8(__m128i a, __m128i b)
Definition sse2neon.h:5811
FORCE_INLINE __m64 _mm_min_pi16(__m64 a, __m64 b)
Definition sse2neon.h:2077
FORCE_INLINE __m128i _mm_shuffle_epi_0101(__m128i a)
Definition sse2neon.h:1095
FORCE_INLINE int _mm_cmpistri(__m128i a, __m128i b, const int imm8)
Definition sse2neon.h:8533
FORCE_INLINE __m128 _mm_movehl_ps(__m128 a, __m128 b)
Definition sse2neon.h:2137
#define SSE2NEON_AES_U1(p)
FORCE_INLINE __m128d _mm_sub_sd(__m128d a, __m128d b)
Definition sse2neon.h:5768
FORCE_INLINE __m128i _mm_cmpestrm(__m128i a, int la, __m128i b, int lb, const int imm8)
Definition sse2neon.h:8434
FORCE_INLINE __m128i _mm_avg_epu16(__m128i a, __m128i b)
Definition sse2neon.h:3118
FORCE_INLINE __m128i _mm_shuffle_epi_2103(__m128i a)
Definition sse2neon.h:1069
FORCE_INLINE __m128 _mm_add_ss(__m128 a, __m128 b)
Definition sse2neon.h:1212
FORCE_INLINE void _mm_stream_pi(__m64 *p, __m64 a)
Definition sse2neon.h:2814
FORCE_INLINE __m128 _mm_ceil_ps(__m128)
Definition sse2neon.h:6917
FORCE_INLINE __m128i _mm_mpsadbw_epu8(__m128i a, __m128i b, const int imm)
Definition sse2neon.h:7445
#define vreinterpretq_m128i_s32(x)
Definition sse2neon.h:464
FORCE_INLINE __m128i _mm_mullo_epi32(__m128i a, __m128i b)
Definition sse2neon.h:7540
FORCE_INLINE __m128 _mm_blend_ps(__m128 _a, __m128 _b, const char imm8)
Definition sse2neon.h:6842
FORCE_INLINE __m128i _mm_aesenc_si128(__m128i a, __m128i RoundKey)
Definition sse2neon.h:8818
FORCE_INLINE __m128 _mm_shuffle_ps_2103(__m128 a, __m128 b)
Definition sse2neon.h:786
FORCE_INLINE __m128 _mm_set_ss(float a)
Definition sse2neon.h:2505
FORCE_INLINE __m128 _mm_shuffle_ps_2032(__m128 a, __m128 b)
Definition sse2neon.h:884
FORCE_INLINE __m128i _mm_mullo_epi16(__m128i a, __m128i b)
Definition sse2neon.h:4862
FORCE_INLINE __m128 _mm_shuffle_ps_2001(__m128 a, __m128 b)
Definition sse2neon.h:875
FORCE_INLINE int64_t _mm_popcnt_u64(uint64_t a)
Definition sse2neon.h:9342
FORCE_INLINE __m128i _mm_subs_epi16(__m128i a, __m128i b)
Definition sse2neon.h:5784
FORCE_INLINE __m128i _mm_minpos_epu16(__m128i a)
Definition sse2neon.h:7395
#define SSE2NEON_GENERATE_CMP_EQUAL_ANY(prefix)
Definition sse2neon.h:8058
FORCE_INLINE __m128d _mm_cvtepi32_pd(__m128i a)
Definition sse2neon.h:3879
FORCE_INLINE __m128d _mm_sqrt_sd(__m128d a, __m128d b)
Definition sse2neon.h:5385
FORCE_INLINE int _mm_cmpistro(__m128i a, __m128i b, const int imm8)
Definition sse2neon.h:8551
#define SSE2NEON_BARRIER()
Definition sse2neon.h:208
FORCE_INLINE __m128 _mm_loadh_pi(__m128 a, __m64 const *p)
Definition sse2neon.h:1931
FORCE_INLINE __m128 _mm_cmpgt_ss(__m128 a, __m128 b)
Definition sse2neon.h:1306
FORCE_INLINE __m128 _mm_rcp_ss(__m128 a)
Definition sse2neon.h:2363
FORCE_INLINE __m128 _mm_castpd_ps(__m128d a)
Definition sse2neon.h:3146
FORCE_INLINE __m128 _mm_move_ss(__m128, __m128)
Definition sse2neon.h:2126
FORCE_INLINE __m64 _mm_cvttpd_pi32(__m128d a)
Definition sse2neon.h:4220
FORCE_INLINE __m128i _mm_sign_epi16(__m128i _a, __m128i _b)
Definition sse2neon.h:6632
FORCE_INLINE __m128i _mm_cmplt_epi16(__m128i a, __m128i b)
Definition sse2neon.h:3445
#define vreinterpretq_f32_m128(x)
Definition sse2neon.h:449
FORCE_INLINE __m128 _mm_round_ps(__m128, int)
Definition sse2neon.h:7629
FORCE_INLINE __m128 _mm_shuffle_ps_2200(__m128 a, __m128 b)
Definition sse2neon.h:840
FORCE_INLINE __m128i _mm_mul_epu32(__m128i a, __m128i b)
Definition sse2neon.h:4763
FORCE_INLINE __m128d _mm_cmpnge_pd(__m128d a, __m128d b)
Definition sse2neon.h:3546
FORCE_INLINE __m64 _mm_mulhi_pu16(__m64 a, __m64 b)
Definition sse2neon.h:2226
FORCE_INLINE __m128 _mm_floor_ss(__m128 a, __m128 b)
Definition sse2neon.h:7265
FORCE_INLINE __m128i _mm_set_epi64(__m64 i1, __m64 i2)
Definition sse2neon.h:4967
FORCE_INLINE void _mm_store_pd(double *mem_addr, __m128d a)
Definition sse2neon.h:5537
FORCE_INLINE __m128i _mm_mulhi_epu16(__m128i a, __m128i b)
Definition sse2neon.h:4838
FORCE_INLINE __m128 _mm_cmpneq_ps(__m128 a, __m128 b)
Definition sse2neon.h:1350
FORCE_INLINE void _mm_storer_pd(double *mem_addr, __m128d a)
Definition sse2neon.h:5624
FORCE_INLINE int _mm_cmpistra(__m128i a, __m128i b, const int imm8)
Definition sse2neon.h:8515
FORCE_INLINE __m128i _mm_shuffle_epi_1001(__m128i a)
Definition sse2neon.h:1085
FORCE_INLINE __m128d _mm_undefined_pd(void)
Definition sse2neon.h:5826
#define _MM_FROUND_TO_ZERO
Definition sse2neon.h:373
FORCE_INLINE __m128d _mm_set_pd(double, double)
Definition sse2neon.h:5010
#define SSE2NEON_GENERATE_AGGREGATE_EQUAL_ORDER(prefix)
Definition sse2neon.h:8223
FORCE_INLINE __m128d _mm_castps_pd(__m128 a)
Definition sse2neon.h:3162
FORCE_INLINE __m128 _mm_shuffle_ps_1133(__m128 a, __m128 b)
Definition sse2neon.h:858
FORCE_INLINE __m128 _mm_load1_ps(const float *p)
Definition sse2neon.h:1922
FORCE_INLINE __m128i _mm_cvtepu16_epi64(__m128i a)
Definition sse2neon.h:7039
FORCE_INLINE __m128i _mm_cvtepi16_epi64(__m128i a)
Definition sse2neon.h:6977
FORCE_INLINE __m128 _mm_hsub_ps(__m128 _a, __m128 _b)
Definition sse2neon.h:6120
FORCE_INLINE __m128 _mm_cvtpi32x2_ps(__m64 a, __m64 b)
Definition sse2neon.h:1624
#define _MM_FLUSH_ZERO_ON
Definition sse2neon.h:389
FORCE_INLINE __m128i _mm_cmpgt_epi64(__m128i a, __m128i b)
Definition sse2neon.h:8583
FORCE_INLINE void _mm_prefetch(char const *p, int i)
Definition sse2neon.h:2295
#define _MM_DENORMALS_ZERO_MASK
Definition sse2neon.h:392
FORCE_INLINE __m128 _mm_cvtpu16_ps(__m64 a)
Definition sse2neon.h:1669
FORCE_INLINE int _mm_testc_si128(__m128i a, __m128i b)
Definition sse2neon.h:7776
FORCE_INLINE __m128i _mm_castps_si128(__m128)
Definition sse2neon.h:3170
FORCE_INLINE void _mm_lfence(void)
Definition sse2neon.h:2603
FORCE_INLINE __m128 _mm_load_ps(const float *p)
Definition sse2neon.h:1894
FORCE_INLINE __m128d _mm_min_sd(__m128d a, __m128d b)
Definition sse2neon.h:4616
#define vreinterpretq_m128i_u16(x)
Definition sse2neon.h:468
FORCE_INLINE __m64 _mm_cvtps_pi16(__m128 a)
Definition sse2neon.h:1644
_mm_hint
Definition sse2neon.h:738
@ _MM_HINT_T1
Definition sse2neon.h:741
@ _MM_HINT_T0
Definition sse2neon.h:740
@ _MM_HINT_T2
Definition sse2neon.h:742
@ _MM_HINT_NTA
Definition sse2neon.h:739
#define _MM_FROUND_CUR_DIRECTION
Definition sse2neon.h:374
int64x2_t __m128i
Definition sse2neon.h:415
FORCE_INLINE __m128i _mm_avg_epu8(__m128i a, __m128i b)
Definition sse2neon.h:3127
FORCE_INLINE __m128 _mm_round_ss(__m128 a, __m128 b, int rounding)
Definition sse2neon.h:7710
#define SSE2NEON_AES_SBOX(w)
Definition sse2neon.h:8715
FORCE_INLINE __m128 _mm_rsqrt_ps(__m128 in)
Definition sse2neon.h:2372
FORCE_INLINE void _mm_store_sd(double *mem_addr, __m128d a)
Definition sse2neon.h:5566
#define vreinterpretq_u16_m128i(x)
Definition sse2neon.h:481
FORCE_INLINE __m128 _mm_cvtsd_ss(__m128 a, __m128d b)
Definition sse2neon.h:4094
FORCE_INLINE int _mm_movemask_pi8(__m64 a)
Definition sse2neon.h:2163
FORCE_INLINE __m128i _mm_shuffle_epi_2211(__m128i a)
Definition sse2neon.h:1101
FORCE_INLINE int _mm_cvtt_ss2si(__m128 a)
Definition sse2neon.h:1740
FORCE_INLINE __m128d _mm_cmpge_pd(__m128d a, __m128d b)
Definition sse2neon.h:3276
FORCE_INLINE void _mm_stream_pd(double *p, __m128d a)
Definition sse2neon.h:5660
FORCE_INLINE __m128d _mm_add_pd(__m128d a, __m128d b)
Definition sse2neon.h:2992
FORCE_INLINE void _mm_store_ps(float *p, __m128 a)
Definition sse2neon.h:2732
FORCE_INLINE int64_t _mm_cvttsd_si64(__m128d a)
Definition sse2neon.h:4250
FORCE_INLINE __m128i _mm_unpacklo_epi64(__m128i a, __m128i b)
Definition sse2neon.h:5957
FORCE_INLINE __m128d _mm_sqrt_pd(__m128d a)
Definition sse2neon.h:5367
FORCE_INLINE __m64 _mm_hadd_pi16(__m64 a, __m64 b)
Definition sse2neon.h:6338
FORCE_INLINE void _MM_SET_ROUNDING_MODE(int rounding)
Definition sse2neon.h:2478
FORCE_INLINE __m128d _mm_loadh_pd(__m128d a, const double *p)
Definition sse2neon.h:4382
FORCE_INLINE __m128i _mm_sub_epi8(__m128i a, __m128i b)
Definition sse2neon.h:5732
FORCE_INLINE __m128i _mm_sll_epi64(__m128i a, __m128i count)
Definition sse2neon.h:5309
#define SSE2NEON_CMPISTRX_LENGTH(str, len, imm8)
Definition sse2neon.h:8485
FORCE_INLINE __m128d _mm_max_sd(__m128d a, __m128d b)
Definition sse2neon.h:4550
FORCE_INLINE __m128 _mm_cvt_si2ss(__m128 a, int b)
Definition sse2neon.h:1576
#define vreinterpret_s8_m64(x)
Definition sse2neon.h:504
FORCE_INLINE uint64_t _sse2neon_get_fpcr(void)
Definition sse2neon.h:1817
#define _mm_srli_si128(a, imm)
Definition sse2neon.h:5525
#define _mm_shuffle_epi32(a, imm)
Definition sse2neon.h:5183
FORCE_INLINE __m128 _mm_sqrt_ps(__m128 in)
Definition sse2neon.h:2687
FORCE_INLINE __m128i _mm_sll_epi16(__m128i a, __m128i count)
Definition sse2neon.h:5283
#define vreinterpret_u8_m64(x)
Definition sse2neon.h:499
FORCE_INLINE void _mm_storeh_pi(__m64 *p, __m128 a)
Definition sse2neon.h:2764
FORCE_INLINE __m128i _mm_unpacklo_epi16(__m128i a, __m128i b)
Definition sse2neon.h:5925
FORCE_INLINE int _mm_comigt_ss(__m128 a, __m128 b)
Definition sse2neon.h:1510
FORCE_INLINE __m64 _mm_sign_pi16(__m64 _a, __m64 _b)
Definition sse2neon.h:6718
FORCE_INLINE __m128 _mm_cmpge_ss(__m128 a, __m128 b)
Definition sse2neon.h:1288
FORCE_INLINE __m128 _mm_min_ps(__m128 a, __m128 b)
Definition sse2neon.h:2088
FORCE_INLINE __m128d _mm_movedup_pd(__m128d a)
Definition sse2neon.h:6147
FORCE_INLINE void _mm_pause(void)
Definition sse2neon.h:4922
FORCE_INLINE __m128i _mm_loadu_si64(const void *p)
Definition sse2neon.h:1978
FORCE_INLINE __m128d _mm_xor_pd(__m128d a, __m128d b)
Definition sse2neon.h:6003
FORCE_INLINE __m128i _mm_stream_load_si128(__m128i *p)
Definition sse2neon.h:7719
FORCE_INLINE void _mm_store_pd1(double *mem_addr, __m128d a)
Definition sse2neon.h:5550
FORCE_INLINE __m128 _mm_cvtepi32_ps(__m128i a)
Definition sse2neon.h:3894
FORCE_INLINE __m128 _mm_or_ps(__m128, __m128)
Definition sse2neon.h:2235
FORCE_INLINE __m128d _mm_blendv_pd(__m128d _a, __m128d _b, __m128d _mask)
Definition sse2neon.h:6869
FORCE_INLINE __m128d _mm_unpacklo_pd(__m128d a, __m128d b)
Definition sse2neon.h:5988
FORCE_INLINE int _sse2neon_ctz(unsigned int x)
Definition sse2neon.h:8302
FORCE_INLINE __m128 _mm_min_ss(__m128 a, __m128 b)
Definition sse2neon.h:2115
FORCE_INLINE __m128d _mm_cmple_sd(__m128d a, __m128d b)
Definition sse2neon.h:3423
FORCE_INLINE __m128d _mm_cmpngt_sd(__m128d a, __m128d b)
Definition sse2neon.h:3608
#define SSE2NEON_AES_H0(x)
Definition sse2neon.h:8798
FORCE_INLINE __m64 _mm_abs_pi32(__m64 a)
Definition sse2neon.h:6233
FORCE_INLINE __m128i _mm_srl_epi64(__m128i a, __m128i count)
Definition sse2neon.h:5475
FORCE_INLINE int _mm_cmpestro(__m128i a, int la, __m128i b, int lb, const int imm8)
Definition sse2neon.h:8443
FORCE_INLINE __m128 _mm_max_ps(__m128 a, __m128 b)
Definition sse2neon.h:2040
FORCE_INLINE __m64 _mm_hadd_pi32(__m64 a, __m64 b)
Definition sse2neon.h:6347
FORCE_INLINE void _mm_mfence(void)
Definition sse2neon.h:2593
FORCE_INLINE float _mm_cvtss_f32(__m128 a)
Definition sse2neon.h:1703
FORCE_INLINE __m128 _mm_cmpord_ps(__m128 a, __m128 b)
Definition sse2neon.h:1445
#define _MM_FROUND_NO_EXC
Definition sse2neon.h:375
#define vreinterpretq_s64_m128i(x)
Definition sse2neon.h:478
FORCE_INLINE int _mm_cvtsi128_si32(__m128i a)
Definition sse2neon.h:4110
#define _MM_FLUSH_ZERO_OFF
Definition sse2neon.h:390
FORCE_INLINE __m64 _mm_maddubs_pi16(__m64 _a, __m64 _b)
Definition sse2neon.h:6528
SIMDVec
Definition sse2neon.h:577
FORCE_INLINE void _mm_clflush(void const *p)
Definition sse2neon.h:3201
Definition sse2neon.h:746
uint16_t res0
Definition sse2neon.h:747
uint8_t res1
Definition sse2neon.h:748
uint8_t bit22
Definition sse2neon.h:749
uint8_t bit23
Definition sse2neon.h:750
uint8_t bit24
Definition sse2neon.h:751
uint8_t res2
Definition sse2neon.h:752
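
The entries above are the cross-reference index Doxygen emits for this header: each prototype is followed by the line in sse2neon.h where it is defined. As a minimal sketch of how the indexed intrinsics are typically consumed, the program below assumes an AArch64 toolchain with sse2neon.h on the include path; the file name, compiler flags, and input values are illustrative assumptions, not part of the header.

/* demo.c -- illustrative only; assumes an AArch64 toolchain, e.g.
 *   gcc -O2 -o demo demo.c
 * sse2neon.h is included in place of the x86 headers, so the SSE
 * intrinsics below are executed through their NEON translations. */
#include <stdio.h>
#include "sse2neon.h"

int main(void)
{
    float in[4] = {1.0f, -2.0f, 3.0f, -4.0f};
    float out[4];

    __m128 v    = _mm_loadu_ps(in);      /* unaligned 4 x float load  */
    __m128 bias = _mm_set1_ps(0.5f);     /* broadcast a scalar        */
    __m128 sum  = _mm_add_ps(v, bias);   /* element-wise add          */
    int signs   = _mm_movemask_ps(sum);  /* sign bit of each lane     */

    _mm_storeu_ps(out, sum);
    printf("%f %f %f %f, sign mask = 0x%x\n",
           out[0], out[1], out[2], out[3], (unsigned)signs);
    return 0;
}

On an x86 build the same source can include the native SSE headers instead, which is the usual way projects keep a single SIMD code path across both architectures.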