From f96545acf37d293a3c0f230e2e077c20a6b595ae Mon Sep 17 00:00:00 2001
From: Peter Geis
Date: Wed, 9 Oct 2019 16:08:56 +0000
Subject: [PATCH] libobs: Add aarch64 compatibility layer

Add a compatibility layer utilizing simde (https://github.com/nemequ/simde)
to allow compilation on aarch64 without modifying existing functions.
---
 libobs/util/aarch/check.h        |  258 ++
 libobs/util/aarch/hedley.h       | 1616 ++++++++++++
 libobs/util/aarch/mmx.h          | 1356 ++++++++++
 libobs/util/aarch/simde-arch.h   |  355 +++
 libobs/util/aarch/simde-common.h |  278 ++
 libobs/util/aarch/sse.h          | 2591 ++++++++++++++++++
 libobs/util/aarch/sse2.h         | 4197 ++++++++++++++++++++++++++++++
 libobs/util/sse-intrin.h         |   66 +
 8 files changed, 10717 insertions(+)
 create mode 100644 libobs/util/aarch/check.h
 create mode 100644 libobs/util/aarch/hedley.h
 create mode 100644 libobs/util/aarch/mmx.h
 create mode 100644 libobs/util/aarch/simde-arch.h
 create mode 100644 libobs/util/aarch/simde-common.h
 create mode 100644 libobs/util/aarch/sse.h
 create mode 100644 libobs/util/aarch/sse2.h
 create mode 100644 libobs/util/sse-intrin.h

diff --git a/libobs/util/aarch/check.h b/libobs/util/aarch/check.h
new file mode 100644
index 000000000..2ad107ebf
--- /dev/null
+++ b/libobs/util/aarch/check.h
@@ -0,0 +1,258 @@
+/* Check (assertions)
+ * Portable Snippets - https://github.com/nemequ/portable-snippets
+ * Created by Evan Nemerson
+ *
+ * To the extent possible under law, the authors have waived all
+ * copyright and related or neighboring rights to this code. For
+ * details, see the Creative Commons Zero 1.0 Universal license at
+ * https://creativecommons.org/publicdomain/zero/1.0/
+ */
+
+#if !defined(SIMDE_CHECK_H)
+#define SIMDE_CHECK_H
+
+#if !defined(SIMDE_NDEBUG) && !defined(SIMDE_DEBUG)
+#define SIMDE_NDEBUG 1
+#endif
+
+#include <stdint.h>
+
+#if !defined(_WIN32)
+#define SIMDE_SIZE_MODIFIER "z"
+#define SIMDE_CHAR_MODIFIER "hh"
+#define SIMDE_SHORT_MODIFIER "h"
+#else
+#if defined(_M_X64) || defined(__amd64__)
+#define SIMDE_SIZE_MODIFIER "I64"
+#else
+#define SIMDE_SIZE_MODIFIER ""
+#endif
+#define SIMDE_CHAR_MODIFIER ""
+#define SIMDE_SHORT_MODIFIER ""
+#endif
+
+#if defined(_MSC_VER) && (_MSC_VER >= 1500)
+#define SIMDE__PUSH_DISABLE_MSVC_C4127 \
+	__pragma(warning(push)) __pragma(warning(disable : 4127))
+#define SIMDE__POP_DISABLE_MSVC_C4127 __pragma(warning(pop))
+#else
+#define SIMDE__PUSH_DISABLE_MSVC_C4127
+#define SIMDE__POP_DISABLE_MSVC_C4127
+#endif
+
+#if !defined(simde_errorf)
+#include <stdio.h>
+#include <stdlib.h>
+#define simde_errorf(format, ...) \
+	(fprintf(stderr, format, __VA_ARGS__), abort())
+#endif
+
+#define simde_error(msg) simde_errorf("%s", msg)
+
+#if defined(SIMDE_NDEBUG)
+#if defined(SIMDE_CHECK_FAIL_DEFINED)
+#define simde_assert(expr)
+#else
+#if defined(HEDLEY_ASSUME)
+#define simde_assert(expr) HEDLEY_ASSUME(expr)
+#elif HEDLEY_GCC_VERSION_CHECK(4, 5, 0)
+#define simde_assert(expr) ((void)(!!(expr) ?
1 : (__builtin_unreachable(), 1))) +#elif HEDLEY_MSVC_VERSION_CHECK(13, 10, 0) +#define simde_assert(expr) __assume(expr) +#else +#define simde_assert(expr) +#endif +#endif +#define simde_assert_true(expr) simde_assert(expr) +#define simde_assert_false(expr) simde_assert(!(expr)) +#define simde_assert_type_full(prefix, suffix, T, fmt, a, op, b) \ + simde_assert(((a)op(b))) +#define simde_assert_double_equal(a, b, precision) +#define simde_assert_string_equal(a, b) +#define simde_assert_string_not_equal(a, b) +#define simde_assert_memory_equal(size, a, b) +#define simde_assert_memory_not_equal(size, a, b) +#else +#define simde_assert(expr) \ + do { \ + if (!HEDLEY_LIKELY(expr)) { \ + simde_error("assertion failed: " #expr "\n"); \ + } \ + SIMDE__PUSH_DISABLE_MSVC_C4127 \ + } while (0) SIMDE__POP_DISABLE_MSVC_C4127 + +#define simde_assert_true(expr) \ + do { \ + if (!HEDLEY_LIKELY(expr)) { \ + simde_error("assertion failed: " #expr \ + " is not true\n"); \ + } \ + SIMDE__PUSH_DISABLE_MSVC_C4127 \ + } while (0) SIMDE__POP_DISABLE_MSVC_C4127 + +#define simde_assert_false(expr) \ + do { \ + if (!HEDLEY_LIKELY(!(expr))) { \ + simde_error("assertion failed: " #expr \ + " is not false\n"); \ + } \ + SIMDE__PUSH_DISABLE_MSVC_C4127 \ + } while (0) SIMDE__POP_DISABLE_MSVC_C4127 + +#define simde_assert_type_full(prefix, suffix, T, fmt, a, op, b) \ + do { \ + T simde_tmp_a_ = (a); \ + T simde_tmp_b_ = (b); \ + if (!(simde_tmp_a_ op simde_tmp_b_)) { \ + simde_errorf("assertion failed: %s %s %s (" prefix \ + "%" fmt suffix " %s " prefix \ + "%" fmt suffix ")\n", \ + #a, #op, #b, simde_tmp_a_, #op, \ + simde_tmp_b_); \ + } \ + SIMDE__PUSH_DISABLE_MSVC_C4127 \ + } while (0) SIMDE__POP_DISABLE_MSVC_C4127 + +#define simde_assert_double_equal(a, b, precision) \ + do { \ + const double simde_tmp_a_ = (a); \ + const double simde_tmp_b_ = (b); \ + const double simde_tmp_diff_ = \ + ((simde_tmp_a_ - simde_tmp_b_) < 0) \ + ? -(simde_tmp_a_ - simde_tmp_b_) \ + : (simde_tmp_a_ - simde_tmp_b_); \ + if (HEDLEY_UNLIKELY(simde_tmp_diff_ > 1e-##precision)) { \ + simde_errorf( \ + "assertion failed: %s == %s (%0." #precision \ + "g == %0." 
#precision "g)\n", \ + #a, #b, simde_tmp_a_, simde_tmp_b_); \ + } \ + SIMDE__PUSH_DISABLE_MSVC_C4127 \ + } while (0) SIMDE__POP_DISABLE_MSVC_C4127 + +#include +#define simde_assert_string_equal(a, b) \ + do { \ + const char *simde_tmp_a_ = a; \ + const char *simde_tmp_b_ = b; \ + if (HEDLEY_UNLIKELY(strcmp(simde_tmp_a_, simde_tmp_b_) != \ + 0)) { \ + simde_errorf( \ + "assertion failed: string %s == %s (\"%s\" == \"%s\")\n", \ + #a, #b, simde_tmp_a_, simde_tmp_b_); \ + } \ + SIMDE__PUSH_DISABLE_MSVC_C4127 \ + } while (0) SIMDE__POP_DISABLE_MSVC_C4127 + +#define simde_assert_string_not_equal(a, b) \ + do { \ + const char *simde_tmp_a_ = a; \ + const char *simde_tmp_b_ = b; \ + if (HEDLEY_UNLIKELY(strcmp(simde_tmp_a_, simde_tmp_b_) == \ + 0)) { \ + simde_errorf( \ + "assertion failed: string %s != %s (\"%s\" == \"%s\")\n", \ + #a, #b, simde_tmp_a_, simde_tmp_b_); \ + } \ + SIMDE__PUSH_DISABLE_MSVC_C4127 \ + } while (0) SIMDE__POP_DISABLE_MSVC_C4127 + +#define simde_assert_memory_equal(size, a, b) \ + do { \ + const unsigned char *simde_tmp_a_ = \ + (const unsigned char *)(a); \ + const unsigned char *simde_tmp_b_ = \ + (const unsigned char *)(b); \ + const size_t simde_tmp_size_ = (size); \ + if (HEDLEY_UNLIKELY(memcmp(simde_tmp_a_, simde_tmp_b_, \ + simde_tmp_size_)) != 0) { \ + size_t simde_tmp_pos_; \ + for (simde_tmp_pos_ = 0; \ + simde_tmp_pos_ < simde_tmp_size_; \ + simde_tmp_pos_++) { \ + if (simde_tmp_a_[simde_tmp_pos_] != \ + simde_tmp_b_[simde_tmp_pos_]) { \ + simde_errorf( \ + "assertion failed: memory %s == %s, at offset %" SIMDE_SIZE_MODIFIER \ + "u\n", \ + #a, #b, simde_tmp_pos_); \ + break; \ + } \ + } \ + } \ + SIMDE__PUSH_DISABLE_MSVC_C4127 \ + } while (0) SIMDE__POP_DISABLE_MSVC_C4127 + +#define simde_assert_memory_not_equal(size, a, b) \ + do { \ + const unsigned char *simde_tmp_a_ = \ + (const unsigned char *)(a); \ + const unsigned char *simde_tmp_b_ = \ + (const unsigned char *)(b); \ + const size_t simde_tmp_size_ = (size); \ + if (HEDLEY_UNLIKELY(memcmp(simde_tmp_a_, simde_tmp_b_, \ + simde_tmp_size_)) == 0) { \ + simde_errorf( \ + "assertion failed: memory %s != %s (%" SIMDE_SIZE_MODIFIER \ + "u bytes)\n", \ + #a, #b, simde_tmp_size_); \ + } \ + SIMDE__PUSH_DISABLE_MSVC_C4127 \ + } while (0) SIMDE__POP_DISABLE_MSVC_C4127 +#endif + +#define simde_assert_type(T, fmt, a, op, b) \ + simde_assert_type_full("", "", T, fmt, a, op, b) + +#define simde_assert_char(a, op, b) \ + simde_assert_type_full("'\\x", "'", char, \ + "02" SIMDE_CHAR_MODIFIER "x", a, op, b) +#define simde_assert_uchar(a, op, b) \ + simde_assert_type_full("'\\x", "'", unsigned char, \ + "02" SIMDE_CHAR_MODIFIER "x", a, op, b) +#define simde_assert_short(a, op, b) \ + simde_assert_type(short, SIMDE_SHORT_MODIFIER "d", a, op, b) +#define simde_assert_ushort(a, op, b) \ + simde_assert_type(unsigned short, SIMDE_SHORT_MODIFIER "u", a, op, b) +#define simde_assert_int(a, op, b) simde_assert_type(int, "d", a, op, b) +#define simde_assert_uint(a, op, b) \ + simde_assert_type(unsigned int, "u", a, op, b) +#define simde_assert_long(a, op, b) simde_assert_type(long int, "ld", a, op, b) +#define simde_assert_ulong(a, op, b) \ + simde_assert_type(unsigned long int, "lu", a, op, b) +#define simde_assert_llong(a, op, b) \ + simde_assert_type(long long int, "lld", a, op, b) +#define simde_assert_ullong(a, op, b) \ + simde_assert_type(unsigned long long int, "llu", a, op, b) + +#define simde_assert_size(a, op, b) \ + simde_assert_type(size_t, SIMDE_SIZE_MODIFIER "u", a, op, b) + +#define simde_assert_float(a, op, b) 
simde_assert_type(float, "f", a, op, b) +#define simde_assert_double(a, op, b) simde_assert_type(double, "g", a, op, b) +#define simde_assert_ptr(a, op, b) \ + simde_assert_type(const void *, "p", a, op, b) + +#define simde_assert_int8(a, op, b) simde_assert_type(int8_t, PRIi8, a, op, b) +#define simde_assert_uint8(a, op, b) simde_assert_type(uint8_t, PRIu8, a, op, b) +#define simde_assert_int16(a, op, b) \ + simde_assert_type(int16_t, PRIi16, a, op, b) +#define simde_assert_uint16(a, op, b) \ + simde_assert_type(uint16_t, PRIu16, a, op, b) +#define simde_assert_int32(a, op, b) \ + simde_assert_type(int32_t, PRIi32, a, op, b) +#define simde_assert_uint32(a, op, b) \ + simde_assert_type(uint32_t, PRIu32, a, op, b) +#define simde_assert_int64(a, op, b) \ + simde_assert_type(int64_t, PRIi64, a, op, b) +#define simde_assert_uint64(a, op, b) \ + simde_assert_type(uint64_t, PRIu64, a, op, b) + +#define simde_assert_ptr_equal(a, b) simde_assert_ptr(a, ==, b) +#define simde_assert_ptr_not_equal(a, b) simde_assert_ptr(a, !=, b) +#define simde_assert_null(ptr) simde_assert_ptr(ptr, ==, NULL) +#define simde_assert_not_null(ptr) simde_assert_ptr(ptr, !=, NULL) +#define simde_assert_ptr_null(ptr) simde_assert_ptr(ptr, ==, NULL) +#define simde_assert_ptr_not_null(ptr) simde_assert_ptr(ptr, !=, NULL) + +#endif /* !defined(SIMDE_CHECK_H) */ diff --git a/libobs/util/aarch/hedley.h b/libobs/util/aarch/hedley.h new file mode 100644 index 000000000..3c9cc49de --- /dev/null +++ b/libobs/util/aarch/hedley.h @@ -0,0 +1,1616 @@ +/* Hedley - https://nemequ.github.io/hedley + * Created by Evan Nemerson + * + * To the extent possible under law, the author(s) have dedicated all + * copyright and related and neighboring rights to this software to + * the public domain worldwide. This software is distributed without + * any warranty. + * + * For details, see . 
+ * SPDX-License-Identifier: CC0-1.0 + */ + +#if !defined(HEDLEY_VERSION) || (HEDLEY_VERSION < 10) +#if defined(HEDLEY_VERSION) +#undef HEDLEY_VERSION +#endif +#define HEDLEY_VERSION 10 + +#if defined(HEDLEY_STRINGIFY_EX) +#undef HEDLEY_STRINGIFY_EX +#endif +#define HEDLEY_STRINGIFY_EX(x) #x + +#if defined(HEDLEY_STRINGIFY) +#undef HEDLEY_STRINGIFY +#endif +#define HEDLEY_STRINGIFY(x) HEDLEY_STRINGIFY_EX(x) + +#if defined(HEDLEY_CONCAT_EX) +#undef HEDLEY_CONCAT_EX +#endif +#define HEDLEY_CONCAT_EX(a, b) a##b + +#if defined(HEDLEY_CONCAT) +#undef HEDLEY_CONCAT +#endif +#define HEDLEY_CONCAT(a, b) HEDLEY_CONCAT_EX(a, b) + +#if defined(HEDLEY_VERSION_ENCODE) +#undef HEDLEY_VERSION_ENCODE +#endif +#define HEDLEY_VERSION_ENCODE(major, minor, revision) \ + (((major)*1000000) + ((minor)*1000) + (revision)) + +#if defined(HEDLEY_VERSION_DECODE_MAJOR) +#undef HEDLEY_VERSION_DECODE_MAJOR +#endif +#define HEDLEY_VERSION_DECODE_MAJOR(version) ((version) / 1000000) + +#if defined(HEDLEY_VERSION_DECODE_MINOR) +#undef HEDLEY_VERSION_DECODE_MINOR +#endif +#define HEDLEY_VERSION_DECODE_MINOR(version) (((version) % 1000000) / 1000) + +#if defined(HEDLEY_VERSION_DECODE_REVISION) +#undef HEDLEY_VERSION_DECODE_REVISION +#endif +#define HEDLEY_VERSION_DECODE_REVISION(version) ((version) % 1000) + +#if defined(HEDLEY_GNUC_VERSION) +#undef HEDLEY_GNUC_VERSION +#endif +#if defined(__GNUC__) && defined(__GNUC_PATCHLEVEL__) +#define HEDLEY_GNUC_VERSION \ + HEDLEY_VERSION_ENCODE(__GNUC__, __GNUC_MINOR__, __GNUC_PATCHLEVEL__) +#elif defined(__GNUC__) +#define HEDLEY_GNUC_VERSION HEDLEY_VERSION_ENCODE(__GNUC__, __GNUC_MINOR__, 0) +#endif + +#if defined(HEDLEY_GNUC_VERSION_CHECK) +#undef HEDLEY_GNUC_VERSION_CHECK +#endif +#if defined(HEDLEY_GNUC_VERSION) +#define HEDLEY_GNUC_VERSION_CHECK(major, minor, patch) \ + (HEDLEY_GNUC_VERSION >= HEDLEY_VERSION_ENCODE(major, minor, patch)) +#else +#define HEDLEY_GNUC_VERSION_CHECK(major, minor, patch) (0) +#endif + +#if defined(HEDLEY_MSVC_VERSION) +#undef HEDLEY_MSVC_VERSION +#endif +#if defined(_MSC_FULL_VER) && (_MSC_FULL_VER >= 140000000) +#define HEDLEY_MSVC_VERSION \ + HEDLEY_VERSION_ENCODE(_MSC_FULL_VER / 10000000, \ + (_MSC_FULL_VER % 10000000) / 100000, \ + (_MSC_FULL_VER % 100000) / 100) +#elif defined(_MSC_FULL_VER) +#define HEDLEY_MSVC_VERSION \ + HEDLEY_VERSION_ENCODE(_MSC_FULL_VER / 1000000, \ + (_MSC_FULL_VER % 1000000) / 10000, \ + (_MSC_FULL_VER % 10000) / 10) +#elif defined(_MSC_VER) +#define HEDLEY_MSVC_VERSION \ + HEDLEY_VERSION_ENCODE(_MSC_VER / 100, _MSC_VER % 100, 0) +#endif + +#if defined(HEDLEY_MSVC_VERSION_CHECK) +#undef HEDLEY_MSVC_VERSION_CHECK +#endif +#if !defined(_MSC_VER) +#define HEDLEY_MSVC_VERSION_CHECK(major, minor, patch) (0) +#elif defined(_MSC_VER) && (_MSC_VER >= 1400) +#define HEDLEY_MSVC_VERSION_CHECK(major, minor, patch) \ + (_MSC_FULL_VER >= ((major * 10000000) + (minor * 100000) + (patch))) +#elif defined(_MSC_VER) && (_MSC_VER >= 1200) +#define HEDLEY_MSVC_VERSION_CHECK(major, minor, patch) \ + (_MSC_FULL_VER >= ((major * 1000000) + (minor * 10000) + (patch))) +#else +#define HEDLEY_MSVC_VERSION_CHECK(major, minor, patch) \ + (_MSC_VER >= ((major * 100) + (minor))) +#endif + +#if defined(HEDLEY_INTEL_VERSION) +#undef HEDLEY_INTEL_VERSION +#endif +#if defined(__INTEL_COMPILER) && defined(__INTEL_COMPILER_UPDATE) +#define HEDLEY_INTEL_VERSION \ + HEDLEY_VERSION_ENCODE(__INTEL_COMPILER / 100, __INTEL_COMPILER % 100, \ + __INTEL_COMPILER_UPDATE) +#elif defined(__INTEL_COMPILER) +#define HEDLEY_INTEL_VERSION \ + 
HEDLEY_VERSION_ENCODE(__INTEL_COMPILER / 100, __INTEL_COMPILER % 100, 0) +#endif + +#if defined(HEDLEY_INTEL_VERSION_CHECK) +#undef HEDLEY_INTEL_VERSION_CHECK +#endif +#if defined(HEDLEY_INTEL_VERSION) +#define HEDLEY_INTEL_VERSION_CHECK(major, minor, patch) \ + (HEDLEY_INTEL_VERSION >= HEDLEY_VERSION_ENCODE(major, minor, patch)) +#else +#define HEDLEY_INTEL_VERSION_CHECK(major, minor, patch) (0) +#endif + +#if defined(HEDLEY_PGI_VERSION) +#undef HEDLEY_PGI_VERSION +#endif +#if defined(__PGI) && defined(__PGIC__) && defined(__PGIC_MINOR__) && \ + defined(__PGIC_PATCHLEVEL__) +#define HEDLEY_PGI_VERSION \ + HEDLEY_VERSION_ENCODE(__PGIC__, __PGIC_MINOR__, __PGIC_PATCHLEVEL__) +#endif + +#if defined(HEDLEY_PGI_VERSION_CHECK) +#undef HEDLEY_PGI_VERSION_CHECK +#endif +#if defined(HEDLEY_PGI_VERSION) +#define HEDLEY_PGI_VERSION_CHECK(major, minor, patch) \ + (HEDLEY_PGI_VERSION >= HEDLEY_VERSION_ENCODE(major, minor, patch)) +#else +#define HEDLEY_PGI_VERSION_CHECK(major, minor, patch) (0) +#endif + +#if defined(HEDLEY_SUNPRO_VERSION) +#undef HEDLEY_SUNPRO_VERSION +#endif +#if defined(__SUNPRO_C) && (__SUNPRO_C > 0x1000) +#define HEDLEY_SUNPRO_VERSION \ + HEDLEY_VERSION_ENCODE((((__SUNPRO_C >> 16) & 0xf) * 10) + \ + ((__SUNPRO_C >> 12) & 0xf), \ + (((__SUNPRO_C >> 8) & 0xf) * 10) + \ + ((__SUNPRO_C >> 4) & 0xf), \ + (__SUNPRO_C & 0xf) * 10) +#elif defined(__SUNPRO_C) +#define HEDLEY_SUNPRO_VERSION \ + HEDLEY_VERSION_ENCODE((__SUNPRO_C >> 8) & 0xf, \ + (__SUNPRO_C >> 4) & 0xf, (__SUNPRO_C)&0xf) +#elif defined(__SUNPRO_CC) && (__SUNPRO_CC > 0x1000) +#define HEDLEY_SUNPRO_VERSION \ + HEDLEY_VERSION_ENCODE((((__SUNPRO_CC >> 16) & 0xf) * 10) + \ + ((__SUNPRO_CC >> 12) & 0xf), \ + (((__SUNPRO_CC >> 8) & 0xf) * 10) + \ + ((__SUNPRO_CC >> 4) & 0xf), \ + (__SUNPRO_CC & 0xf) * 10) +#elif defined(__SUNPRO_CC) +#define HEDLEY_SUNPRO_VERSION \ + HEDLEY_VERSION_ENCODE((__SUNPRO_CC >> 8) & 0xf, \ + (__SUNPRO_CC >> 4) & 0xf, (__SUNPRO_CC)&0xf) +#endif + +#if defined(HEDLEY_SUNPRO_VERSION_CHECK) +#undef HEDLEY_SUNPRO_VERSION_CHECK +#endif +#if defined(HEDLEY_SUNPRO_VERSION) +#define HEDLEY_SUNPRO_VERSION_CHECK(major, minor, patch) \ + (HEDLEY_SUNPRO_VERSION >= HEDLEY_VERSION_ENCODE(major, minor, patch)) +#else +#define HEDLEY_SUNPRO_VERSION_CHECK(major, minor, patch) (0) +#endif + +#if defined(HEDLEY_EMSCRIPTEN_VERSION) +#undef HEDLEY_EMSCRIPTEN_VERSION +#endif +#if defined(__EMSCRIPTEN__) +#define HEDLEY_EMSCRIPTEN_VERSION \ + HEDLEY_VERSION_ENCODE(__EMSCRIPTEN_major__, __EMSCRIPTEN_minor__, \ + __EMSCRIPTEN_tiny__) +#endif + +#if defined(HEDLEY_EMSCRIPTEN_VERSION_CHECK) +#undef HEDLEY_EMSCRIPTEN_VERSION_CHECK +#endif +#if defined(HEDLEY_EMSCRIPTEN_VERSION) +#define HEDLEY_EMSCRIPTEN_VERSION_CHECK(major, minor, patch) \ + (HEDLEY_EMSCRIPTEN_VERSION >= \ + HEDLEY_VERSION_ENCODE(major, minor, patch)) +#else +#define HEDLEY_EMSCRIPTEN_VERSION_CHECK(major, minor, patch) (0) +#endif + +#if defined(HEDLEY_ARM_VERSION) +#undef HEDLEY_ARM_VERSION +#endif +#if defined(__CC_ARM) && defined(__ARMCOMPILER_VERSION) +#define HEDLEY_ARM_VERSION \ + HEDLEY_VERSION_ENCODE(__ARMCOMPILER_VERSION / 1000000, \ + (__ARMCOMPILER_VERSION % 1000000) / 10000, \ + (__ARMCOMPILER_VERSION % 10000) / 100) +#elif defined(__CC_ARM) && defined(__ARMCC_VERSION) +#define HEDLEY_ARM_VERSION \ + HEDLEY_VERSION_ENCODE(__ARMCC_VERSION / 1000000, \ + (__ARMCC_VERSION % 1000000) / 10000, \ + (__ARMCC_VERSION % 10000) / 100) +#endif + +#if defined(HEDLEY_ARM_VERSION_CHECK) +#undef HEDLEY_ARM_VERSION_CHECK +#endif +#if defined(HEDLEY_ARM_VERSION) 
+#define HEDLEY_ARM_VERSION_CHECK(major, minor, patch) \ + (HEDLEY_ARM_VERSION >= HEDLEY_VERSION_ENCODE(major, minor, patch)) +#else +#define HEDLEY_ARM_VERSION_CHECK(major, minor, patch) (0) +#endif + +#if defined(HEDLEY_IBM_VERSION) +#undef HEDLEY_IBM_VERSION +#endif +#if defined(__ibmxl__) +#define HEDLEY_IBM_VERSION \ + HEDLEY_VERSION_ENCODE(__ibmxl_version__, __ibmxl_release__, \ + __ibmxl_modification__) +#elif defined(__xlC__) && defined(__xlC_ver__) +#define HEDLEY_IBM_VERSION \ + HEDLEY_VERSION_ENCODE(__xlC__ >> 8, __xlC__ & 0xff, \ + (__xlC_ver__ >> 8) & 0xff) +#elif defined(__xlC__) +#define HEDLEY_IBM_VERSION \ + HEDLEY_VERSION_ENCODE(__xlC__ >> 8, __xlC__ & 0xff, 0) +#endif + +#if defined(HEDLEY_IBM_VERSION_CHECK) +#undef HEDLEY_IBM_VERSION_CHECK +#endif +#if defined(HEDLEY_IBM_VERSION) +#define HEDLEY_IBM_VERSION_CHECK(major, minor, patch) \ + (HEDLEY_IBM_VERSION >= HEDLEY_VERSION_ENCODE(major, minor, patch)) +#else +#define HEDLEY_IBM_VERSION_CHECK(major, minor, patch) (0) +#endif + +#if defined(HEDLEY_TI_VERSION) +#undef HEDLEY_TI_VERSION +#endif +#if defined(__TI_COMPILER_VERSION__) +#define HEDLEY_TI_VERSION \ + HEDLEY_VERSION_ENCODE(__TI_COMPILER_VERSION__ / 1000000, \ + (__TI_COMPILER_VERSION__ % 1000000) / 1000, \ + (__TI_COMPILER_VERSION__ % 1000)) +#endif + +#if defined(HEDLEY_TI_VERSION_CHECK) +#undef HEDLEY_TI_VERSION_CHECK +#endif +#if defined(HEDLEY_TI_VERSION) +#define HEDLEY_TI_VERSION_CHECK(major, minor, patch) \ + (HEDLEY_TI_VERSION >= HEDLEY_VERSION_ENCODE(major, minor, patch)) +#else +#define HEDLEY_TI_VERSION_CHECK(major, minor, patch) (0) +#endif + +#if defined(HEDLEY_CRAY_VERSION) +#undef HEDLEY_CRAY_VERSION +#endif +#if defined(_CRAYC) +#if defined(_RELEASE_PATCHLEVEL) +#define HEDLEY_CRAY_VERSION \ + HEDLEY_VERSION_ENCODE(_RELEASE_MAJOR, _RELEASE_MINOR, \ + _RELEASE_PATCHLEVEL) +#else +#define HEDLEY_CRAY_VERSION \ + HEDLEY_VERSION_ENCODE(_RELEASE_MAJOR, _RELEASE_MINOR, 0) +#endif +#endif + +#if defined(HEDLEY_CRAY_VERSION_CHECK) +#undef HEDLEY_CRAY_VERSION_CHECK +#endif +#if defined(HEDLEY_CRAY_VERSION) +#define HEDLEY_CRAY_VERSION_CHECK(major, minor, patch) \ + (HEDLEY_CRAY_VERSION >= HEDLEY_VERSION_ENCODE(major, minor, patch)) +#else +#define HEDLEY_CRAY_VERSION_CHECK(major, minor, patch) (0) +#endif + +#if defined(HEDLEY_IAR_VERSION) +#undef HEDLEY_IAR_VERSION +#endif +#if defined(__IAR_SYSTEMS_ICC__) +#if __VER__ > 1000 +#define HEDLEY_IAR_VERSION \ + HEDLEY_VERSION_ENCODE((__VER__ / 1000000), ((__VER__ / 1000) % 1000), \ + (__VER__ % 1000)) +#else +#define HEDLEY_IAR_VERSION HEDLEY_VERSION_ENCODE(VER / 100, __VER__ % 100, 0) +#endif +#endif + +#if defined(HEDLEY_IAR_VERSION_CHECK) +#undef HEDLEY_IAR_VERSION_CHECK +#endif +#if defined(HEDLEY_IAR_VERSION) +#define HEDLEY_IAR_VERSION_CHECK(major, minor, patch) \ + (HEDLEY_IAR_VERSION >= HEDLEY_VERSION_ENCODE(major, minor, patch)) +#else +#define HEDLEY_IAR_VERSION_CHECK(major, minor, patch) (0) +#endif + +#if defined(HEDLEY_TINYC_VERSION) +#undef HEDLEY_TINYC_VERSION +#endif +#if defined(__TINYC__) +#define HEDLEY_TINYC_VERSION \ + HEDLEY_VERSION_ENCODE(__TINYC__ / 1000, (__TINYC__ / 100) % 10, \ + __TINYC__ % 100) +#endif + +#if defined(HEDLEY_TINYC_VERSION_CHECK) +#undef HEDLEY_TINYC_VERSION_CHECK +#endif +#if defined(HEDLEY_TINYC_VERSION) +#define HEDLEY_TINYC_VERSION_CHECK(major, minor, patch) \ + (HEDLEY_TINYC_VERSION >= HEDLEY_VERSION_ENCODE(major, minor, patch)) +#else +#define HEDLEY_TINYC_VERSION_CHECK(major, minor, patch) (0) +#endif + +#if defined(HEDLEY_DMC_VERSION) +#undef 
HEDLEY_DMC_VERSION +#endif +#if defined(__DMC__) +#define HEDLEY_DMC_VERSION \ + HEDLEY_VERSION_ENCODE(__DMC__ >> 8, (__DMC__ >> 4) & 0xf, __DMC__ & 0xf) +#endif + +#if defined(HEDLEY_DMC_VERSION_CHECK) +#undef HEDLEY_DMC_VERSION_CHECK +#endif +#if defined(HEDLEY_DMC_VERSION) +#define HEDLEY_DMC_VERSION_CHECK(major, minor, patch) \ + (HEDLEY_DMC_VERSION >= HEDLEY_VERSION_ENCODE(major, minor, patch)) +#else +#define HEDLEY_DMC_VERSION_CHECK(major, minor, patch) (0) +#endif + +#if defined(HEDLEY_COMPCERT_VERSION) +#undef HEDLEY_COMPCERT_VERSION +#endif +#if defined(__COMPCERT_VERSION__) +#define HEDLEY_COMPCERT_VERSION \ + HEDLEY_VERSION_ENCODE(__COMPCERT_VERSION__ / 10000, \ + (__COMPCERT_VERSION__ / 100) % 100, \ + __COMPCERT_VERSION__ % 100) +#endif + +#if defined(HEDLEY_COMPCERT_VERSION_CHECK) +#undef HEDLEY_COMPCERT_VERSION_CHECK +#endif +#if defined(HEDLEY_COMPCERT_VERSION) +#define HEDLEY_COMPCERT_VERSION_CHECK(major, minor, patch) \ + (HEDLEY_COMPCERT_VERSION >= HEDLEY_VERSION_ENCODE(major, minor, patch)) +#else +#define HEDLEY_COMPCERT_VERSION_CHECK(major, minor, patch) (0) +#endif + +#if defined(HEDLEY_PELLES_VERSION) +#undef HEDLEY_PELLES_VERSION +#endif +#if defined(__POCC__) +#define HEDLEY_PELLES_VERSION \ + HEDLEY_VERSION_ENCODE(__POCC__ / 100, __POCC__ % 100, 0) +#endif + +#if defined(HEDLEY_PELLES_VERSION_CHECK) +#undef HEDLEY_PELLES_VERSION_CHECK +#endif +#if defined(HEDLEY_PELLES_VERSION) +#define HEDLEY_PELLES_VERSION_CHECK(major, minor, patch) \ + (HEDLEY_PELLES_VERSION >= HEDLEY_VERSION_ENCODE(major, minor, patch)) +#else +#define HEDLEY_PELLES_VERSION_CHECK(major, minor, patch) (0) +#endif + +#if defined(HEDLEY_GCC_VERSION) +#undef HEDLEY_GCC_VERSION +#endif +#if defined(HEDLEY_GNUC_VERSION) && !defined(__clang__) && \ + !defined(HEDLEY_INTEL_VERSION) && !defined(HEDLEY_PGI_VERSION) && \ + !defined(HEDLEY_ARM_VERSION) && !defined(HEDLEY_TI_VERSION) && \ + !defined(__COMPCERT__) +#define HEDLEY_GCC_VERSION HEDLEY_GNUC_VERSION +#endif + +#if defined(HEDLEY_GCC_VERSION_CHECK) +#undef HEDLEY_GCC_VERSION_CHECK +#endif +#if defined(HEDLEY_GCC_VERSION) +#define HEDLEY_GCC_VERSION_CHECK(major, minor, patch) \ + (HEDLEY_GCC_VERSION >= HEDLEY_VERSION_ENCODE(major, minor, patch)) +#else +#define HEDLEY_GCC_VERSION_CHECK(major, minor, patch) (0) +#endif + +#if defined(HEDLEY_HAS_ATTRIBUTE) +#undef HEDLEY_HAS_ATTRIBUTE +#endif +#if defined(__has_attribute) +#define HEDLEY_HAS_ATTRIBUTE(attribute) __has_attribute(attribute) +#else +#define HEDLEY_HAS_ATTRIBUTE(attribute) (0) +#endif + +#if defined(HEDLEY_GNUC_HAS_ATTRIBUTE) +#undef HEDLEY_GNUC_HAS_ATTRIBUTE +#endif +#if defined(__has_attribute) +#define HEDLEY_GNUC_HAS_ATTRIBUTE(attribute, major, minor, patch) \ + __has_attribute(attribute) +#else +#define HEDLEY_GNUC_HAS_ATTRIBUTE(attribute, major, minor, patch) \ + HEDLEY_GNUC_VERSION_CHECK(major, minor, patch) +#endif + +#if defined(HEDLEY_GCC_HAS_ATTRIBUTE) +#undef HEDLEY_GCC_HAS_ATTRIBUTE +#endif +#if defined(__has_attribute) +#define HEDLEY_GCC_HAS_ATTRIBUTE(attribute, major, minor, patch) \ + __has_attribute(attribute) +#else +#define HEDLEY_GCC_HAS_ATTRIBUTE(attribute, major, minor, patch) \ + HEDLEY_GCC_VERSION_CHECK(major, minor, patch) +#endif + +#if defined(HEDLEY_HAS_CPP_ATTRIBUTE) +#undef HEDLEY_HAS_CPP_ATTRIBUTE +#endif +#if defined(__has_cpp_attribute) && defined(__cplusplus) +#define HEDLEY_HAS_CPP_ATTRIBUTE(attribute) __has_cpp_attribute(attribute) +#else +#define HEDLEY_HAS_CPP_ATTRIBUTE(attribute) (0) +#endif + +#if defined(HEDLEY_GNUC_HAS_CPP_ATTRIBUTE) +#undef 
HEDLEY_GNUC_HAS_CPP_ATTRIBUTE +#endif +#if defined(__has_cpp_attribute) && defined(__cplusplus) +#define HEDLEY_GNUC_HAS_CPP_ATTRIBUTE(attribute, major, minor, patch) \ + __has_cpp_attribute(attribute) +#else +#define HEDLEY_GNUC_HAS_CPP_ATTRIBUTE(attribute, major, minor, patch) \ + HEDLEY_GNUC_VERSION_CHECK(major, minor, patch) +#endif + +#if defined(HEDLEY_GCC_HAS_CPP_ATTRIBUTE) +#undef HEDLEY_GCC_HAS_CPP_ATTRIBUTE +#endif +#if defined(__has_cpp_attribute) && defined(__cplusplus) +#define HEDLEY_GCC_HAS_CPP_ATTRIBUTE(attribute, major, minor, patch) \ + __has_cpp_attribute(attribute) +#else +#define HEDLEY_GCC_HAS_CPP_ATTRIBUTE(attribute, major, minor, patch) \ + HEDLEY_GCC_VERSION_CHECK(major, minor, patch) +#endif + +#if defined(HEDLEY_HAS_BUILTIN) +#undef HEDLEY_HAS_BUILTIN +#endif +#if defined(__has_builtin) +#define HEDLEY_HAS_BUILTIN(builtin) __has_builtin(builtin) +#else +#define HEDLEY_HAS_BUILTIN(builtin) (0) +#endif + +#if defined(HEDLEY_GNUC_HAS_BUILTIN) +#undef HEDLEY_GNUC_HAS_BUILTIN +#endif +#if defined(__has_builtin) +#define HEDLEY_GNUC_HAS_BUILTIN(builtin, major, minor, patch) \ + __has_builtin(builtin) +#else +#define HEDLEY_GNUC_HAS_BUILTIN(builtin, major, minor, patch) \ + HEDLEY_GNUC_VERSION_CHECK(major, minor, patch) +#endif + +#if defined(HEDLEY_GCC_HAS_BUILTIN) +#undef HEDLEY_GCC_HAS_BUILTIN +#endif +#if defined(__has_builtin) +#define HEDLEY_GCC_HAS_BUILTIN(builtin, major, minor, patch) \ + __has_builtin(builtin) +#else +#define HEDLEY_GCC_HAS_BUILTIN(builtin, major, minor, patch) \ + HEDLEY_GCC_VERSION_CHECK(major, minor, patch) +#endif + +#if defined(HEDLEY_HAS_FEATURE) +#undef HEDLEY_HAS_FEATURE +#endif +#if defined(__has_feature) +#define HEDLEY_HAS_FEATURE(feature) __has_feature(feature) +#else +#define HEDLEY_HAS_FEATURE(feature) (0) +#endif + +#if defined(HEDLEY_GNUC_HAS_FEATURE) +#undef HEDLEY_GNUC_HAS_FEATURE +#endif +#if defined(__has_feature) +#define HEDLEY_GNUC_HAS_FEATURE(feature, major, minor, patch) \ + __has_feature(feature) +#else +#define HEDLEY_GNUC_HAS_FEATURE(feature, major, minor, patch) \ + HEDLEY_GNUC_VERSION_CHECK(major, minor, patch) +#endif + +#if defined(HEDLEY_GCC_HAS_FEATURE) +#undef HEDLEY_GCC_HAS_FEATURE +#endif +#if defined(__has_feature) +#define HEDLEY_GCC_HAS_FEATURE(feature, major, minor, patch) \ + __has_feature(feature) +#else +#define HEDLEY_GCC_HAS_FEATURE(feature, major, minor, patch) \ + HEDLEY_GCC_VERSION_CHECK(major, minor, patch) +#endif + +#if defined(HEDLEY_HAS_EXTENSION) +#undef HEDLEY_HAS_EXTENSION +#endif +#if defined(__has_extension) +#define HEDLEY_HAS_EXTENSION(extension) __has_extension(extension) +#else +#define HEDLEY_HAS_EXTENSION(extension) (0) +#endif + +#if defined(HEDLEY_GNUC_HAS_EXTENSION) +#undef HEDLEY_GNUC_HAS_EXTENSION +#endif +#if defined(__has_extension) +#define HEDLEY_GNUC_HAS_EXTENSION(extension, major, minor, patch) \ + __has_extension(extension) +#else +#define HEDLEY_GNUC_HAS_EXTENSION(extension, major, minor, patch) \ + HEDLEY_GNUC_VERSION_CHECK(major, minor, patch) +#endif + +#if defined(HEDLEY_GCC_HAS_EXTENSION) +#undef HEDLEY_GCC_HAS_EXTENSION +#endif +#if defined(__has_extension) +#define HEDLEY_GCC_HAS_EXTENSION(extension, major, minor, patch) \ + __has_extension(extension) +#else +#define HEDLEY_GCC_HAS_EXTENSION(extension, major, minor, patch) \ + HEDLEY_GCC_VERSION_CHECK(major, minor, patch) +#endif + +#if defined(HEDLEY_HAS_DECLSPEC_ATTRIBUTE) +#undef HEDLEY_HAS_DECLSPEC_ATTRIBUTE +#endif +#if defined(__has_declspec_attribute) +#define HEDLEY_HAS_DECLSPEC_ATTRIBUTE(attribute) \ + 
__has_declspec_attribute(attribute) +#else +#define HEDLEY_HAS_DECLSPEC_ATTRIBUTE(attribute) (0) +#endif + +#if defined(HEDLEY_GNUC_HAS_DECLSPEC_ATTRIBUTE) +#undef HEDLEY_GNUC_HAS_DECLSPEC_ATTRIBUTE +#endif +#if defined(__has_declspec_attribute) +#define HEDLEY_GNUC_HAS_DECLSPEC_ATTRIBUTE(attribute, major, minor, patch) \ + __has_declspec_attribute(attribute) +#else +#define HEDLEY_GNUC_HAS_DECLSPEC_ATTRIBUTE(attribute, major, minor, patch) \ + HEDLEY_GNUC_VERSION_CHECK(major, minor, patch) +#endif + +#if defined(HEDLEY_GCC_HAS_DECLSPEC_ATTRIBUTE) +#undef HEDLEY_GCC_HAS_DECLSPEC_ATTRIBUTE +#endif +#if defined(__has_declspec_attribute) +#define HEDLEY_GCC_HAS_DECLSPEC_ATTRIBUTE(attribute, major, minor, patch) \ + __has_declspec_attribute(attribute) +#else +#define HEDLEY_GCC_HAS_DECLSPEC_ATTRIBUTE(attribute, major, minor, patch) \ + HEDLEY_GCC_VERSION_CHECK(major, minor, patch) +#endif + +#if defined(HEDLEY_HAS_WARNING) +#undef HEDLEY_HAS_WARNING +#endif +#if defined(__has_warning) +#define HEDLEY_HAS_WARNING(warning) __has_warning(warning) +#else +#define HEDLEY_HAS_WARNING(warning) (0) +#endif + +#if defined(HEDLEY_GNUC_HAS_WARNING) +#undef HEDLEY_GNUC_HAS_WARNING +#endif +#if defined(__has_warning) +#define HEDLEY_GNUC_HAS_WARNING(warning, major, minor, patch) \ + __has_warning(warning) +#else +#define HEDLEY_GNUC_HAS_WARNING(warning, major, minor, patch) \ + HEDLEY_GNUC_VERSION_CHECK(major, minor, patch) +#endif + +#if defined(HEDLEY_GCC_HAS_WARNING) +#undef HEDLEY_GCC_HAS_WARNING +#endif +#if defined(__has_warning) +#define HEDLEY_GCC_HAS_WARNING(warning, major, minor, patch) \ + __has_warning(warning) +#else +#define HEDLEY_GCC_HAS_WARNING(warning, major, minor, patch) \ + HEDLEY_GCC_VERSION_CHECK(major, minor, patch) +#endif + +#if (defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L)) || \ + defined(__clang__) || HEDLEY_GCC_VERSION_CHECK(3, 0, 0) || \ + HEDLEY_INTEL_VERSION_CHECK(13, 0, 0) || \ + HEDLEY_IAR_VERSION_CHECK(8, 0, 0) || \ + HEDLEY_PGI_VERSION_CHECK(18, 4, 0) || \ + HEDLEY_ARM_VERSION_CHECK(4, 1, 0) || \ + HEDLEY_TI_VERSION_CHECK(6, 0, 0) || \ + HEDLEY_CRAY_VERSION_CHECK(5, 0, 0) || \ + HEDLEY_TINYC_VERSION_CHECK(0, 9, 17) || \ + HEDLEY_SUNPRO_VERSION_CHECK(8, 0, 0) || \ + (HEDLEY_IBM_VERSION_CHECK(10, 1, 0) && defined(__C99_PRAGMA_OPERATOR)) +#define HEDLEY_PRAGMA(value) _Pragma(#value) +#elif HEDLEY_MSVC_VERSION_CHECK(15, 0, 0) +#define HEDLEY_PRAGMA(value) __pragma(value) +#else +#define HEDLEY_PRAGMA(value) +#endif + +#if defined(HEDLEY_DIAGNOSTIC_PUSH) +#undef HEDLEY_DIAGNOSTIC_PUSH +#endif +#if defined(HEDLEY_DIAGNOSTIC_POP) +#undef HEDLEY_DIAGNOSTIC_POP +#endif +#if defined(__clang__) +#define HEDLEY_DIAGNOSTIC_PUSH _Pragma("clang diagnostic push") +#define HEDLEY_DIAGNOSTIC_POP _Pragma("clang diagnostic pop") +#elif HEDLEY_INTEL_VERSION_CHECK(13, 0, 0) +#define HEDLEY_DIAGNOSTIC_PUSH _Pragma("warning(push)") +#define HEDLEY_DIAGNOSTIC_POP _Pragma("warning(pop)") +#elif HEDLEY_GCC_VERSION_CHECK(4, 6, 0) +#define HEDLEY_DIAGNOSTIC_PUSH _Pragma("GCC diagnostic push") +#define HEDLEY_DIAGNOSTIC_POP _Pragma("GCC diagnostic pop") +#elif HEDLEY_MSVC_VERSION_CHECK(15, 0, 0) +#define HEDLEY_DIAGNOSTIC_PUSH __pragma(warning(push)) +#define HEDLEY_DIAGNOSTIC_POP __pragma(warning(pop)) +#elif HEDLEY_ARM_VERSION_CHECK(5, 6, 0) +#define HEDLEY_DIAGNOSTIC_PUSH _Pragma("push") +#define HEDLEY_DIAGNOSTIC_POP _Pragma("pop") +#elif HEDLEY_TI_VERSION_CHECK(8, 1, 0) +#define HEDLEY_DIAGNOSTIC_PUSH _Pragma("diag_push") +#define HEDLEY_DIAGNOSTIC_POP _Pragma("diag_pop") +#elif 
HEDLEY_PELLES_VERSION_CHECK(2, 90, 0) +#define HEDLEY_DIAGNOSTIC_PUSH _Pragma("warning(push)") +#define HEDLEY_DIAGNOSTIC_POP _Pragma("warning(pop)") +#else +#define HEDLEY_DIAGNOSTIC_PUSH +#define HEDLEY_DIAGNOSTIC_POP +#endif + +#if defined(HEDLEY_DIAGNOSTIC_DISABLE_DEPRECATED) +#undef HEDLEY_DIAGNOSTIC_DISABLE_DEPRECATED +#endif +#if HEDLEY_HAS_WARNING("-Wdeprecated-declarations") +#define HEDLEY_DIAGNOSTIC_DISABLE_DEPRECATED \ + _Pragma("clang diagnostic ignored \"-Wdeprecated-declarations\"") +#elif HEDLEY_INTEL_VERSION_CHECK(13, 0, 0) +#define HEDLEY_DIAGNOSTIC_DISABLE_DEPRECATED \ + _Pragma("warning(disable:1478 1786)") +#elif HEDLEY_PGI_VERSION_CHECK(17, 10, 0) +#define HEDLEY_DIAGNOSTIC_DISABLE_DEPRECATED _Pragma("diag_suppress 1215,1444") +#elif HEDLEY_GCC_VERSION_CHECK(4, 3, 0) +#define HEDLEY_DIAGNOSTIC_DISABLE_DEPRECATED \ + _Pragma("GCC diagnostic ignored \"-Wdeprecated-declarations\"") +#elif HEDLEY_MSVC_VERSION_CHECK(15, 0, 0) +#define HEDLEY_DIAGNOSTIC_DISABLE_DEPRECATED __pragma(warning(disable : 4996)) +#elif HEDLEY_TI_VERSION_CHECK(8, 0, 0) +#define HEDLEY_DIAGNOSTIC_DISABLE_DEPRECATED _Pragma("diag_suppress 1291,1718") +#elif HEDLEY_SUNPRO_VERSION_CHECK(5, 13, 0) && !defined(__cplusplus) +#define HEDLEY_DIAGNOSTIC_DISABLE_DEPRECATED \ + _Pragma("error_messages(off,E_DEPRECATED_ATT,E_DEPRECATED_ATT_MESS)") +#elif HEDLEY_SUNPRO_VERSION_CHECK(5, 13, 0) && defined(__cplusplus) +#define HEDLEY_DIAGNOSTIC_DISABLE_DEPRECATED \ + _Pragma("error_messages(off,symdeprecated,symdeprecated2)") +#elif HEDLEY_IAR_VERSION_CHECK(8, 0, 0) +#define HEDLEY_DIAGNOSTIC_DISABLE_DEPRECATED \ + _Pragma("diag_suppress=Pe1444,Pe1215") +#elif HEDLEY_PELLES_VERSION_CHECK(2, 90, 0) +#define HEDLEY_DIAGNOSTIC_DISABLE_DEPRECATED _Pragma("warn(disable:2241)") +#else +#define HEDLEY_DIAGNOSTIC_DISABLE_DEPRECATED +#endif + +#if defined(HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_PRAGMAS) +#undef HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_PRAGMAS +#endif +#if HEDLEY_HAS_WARNING("-Wunknown-pragmas") +#define HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_PRAGMAS \ + _Pragma("clang diagnostic ignored \"-Wunknown-pragmas\"") +#elif HEDLEY_INTEL_VERSION_CHECK(13, 0, 0) +#define HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_PRAGMAS \ + _Pragma("warning(disable:161)") +#elif HEDLEY_PGI_VERSION_CHECK(17, 10, 0) +#define HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_PRAGMAS _Pragma("diag_suppress 1675") +#elif HEDLEY_GCC_VERSION_CHECK(4, 3, 0) +#define HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_PRAGMAS \ + _Pragma("GCC diagnostic ignored \"-Wunknown-pragmas\"") +#elif HEDLEY_MSVC_VERSION_CHECK(15, 0, 0) +#define HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_PRAGMAS \ + __pragma(warning(disable : 4068)) +#elif HEDLEY_TI_VERSION_CHECK(8, 0, 0) +#define HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_PRAGMAS _Pragma("diag_suppress 163") +#elif HEDLEY_IAR_VERSION_CHECK(8, 0, 0) +#define HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_PRAGMAS _Pragma("diag_suppress=Pe161") +#else +#define HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_PRAGMAS +#endif + +#if defined(HEDLEY_DIAGNOSTIC_DISABLE_CAST_QUAL) +#undef HEDLEY_DIAGNOSTIC_DISABLE_CAST_QUAL +#endif +#if HEDLEY_HAS_WARNING("-Wcast-qual") +#define HEDLEY_DIAGNOSTIC_DISABLE_CAST_QUAL \ + _Pragma("clang diagnostic ignored \"-Wcast-qual\"") +#elif HEDLEY_INTEL_VERSION_CHECK(13, 0, 0) +#define HEDLEY_DIAGNOSTIC_DISABLE_CAST_QUAL \ + _Pragma("warning(disable:2203 2331)") +#elif HEDLEY_GCC_VERSION_CHECK(3, 0, 0) +#define HEDLEY_DIAGNOSTIC_DISABLE_CAST_QUAL \ + _Pragma("GCC diagnostic ignored \"-Wcast-qual\"") +#else +#define HEDLEY_DIAGNOSTIC_DISABLE_CAST_QUAL +#endif + +#if 
defined(HEDLEY_DEPRECATED) +#undef HEDLEY_DEPRECATED +#endif +#if defined(HEDLEY_DEPRECATED_FOR) +#undef HEDLEY_DEPRECATED_FOR +#endif +#if defined(__cplusplus) && (__cplusplus >= 201402L) +#define HEDLEY_DEPRECATED(since) [[deprecated("Since " #since)]] +#define HEDLEY_DEPRECATED_FOR(since, replacement) \ + [[deprecated("Since " #since "; use " #replacement)]] +#elif HEDLEY_HAS_EXTENSION(attribute_deprecated_with_message) || \ + HEDLEY_GCC_VERSION_CHECK(4, 5, 0) || \ + HEDLEY_INTEL_VERSION_CHECK(13, 0, 0) || \ + HEDLEY_ARM_VERSION_CHECK(5, 6, 0) || \ + HEDLEY_SUNPRO_VERSION_CHECK(5, 13, 0) || \ + HEDLEY_PGI_VERSION_CHECK(17, 10, 0) || \ + HEDLEY_TI_VERSION_CHECK(8, 3, 0) +#define HEDLEY_DEPRECATED(since) \ + __attribute__((__deprecated__("Since " #since))) +#define HEDLEY_DEPRECATED_FOR(since, replacement) \ + __attribute__((__deprecated__("Since " #since "; use " #replacement))) +#elif HEDLEY_HAS_ATTRIBUTE(deprecated) || HEDLEY_GCC_VERSION_CHECK(3, 1, 0) || \ + HEDLEY_ARM_VERSION_CHECK(4, 1, 0) || \ + HEDLEY_TI_VERSION_CHECK(8, 0, 0) || \ + (HEDLEY_TI_VERSION_CHECK(7, 3, 0) && \ + defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) +#define HEDLEY_DEPRECATED(since) __attribute__((__deprecated__)) +#define HEDLEY_DEPRECATED_FOR(since, replacement) \ + __attribute__((__deprecated__)) +#elif HEDLEY_MSVC_VERSION_CHECK(14, 0, 0) +#define HEDLEY_DEPRECATED(since) __declspec(deprecated("Since " #since)) +#define HEDLEY_DEPRECATED_FOR(since, replacement) \ + __declspec(deprecated("Since " #since "; use " #replacement)) +#elif HEDLEY_MSVC_VERSION_CHECK(13, 10, 0) || \ + HEDLEY_PELLES_VERSION_CHECK(6, 50, 0) +#define HEDLEY_DEPRECATED(since) __declspec(deprecated) +#define HEDLEY_DEPRECATED_FOR(since, replacement) __declspec(deprecated) +#elif HEDLEY_IAR_VERSION_CHECK(8, 0, 0) +#define HEDLEY_DEPRECATED(since) _Pragma("deprecated") +#define HEDLEY_DEPRECATED_FOR(since, replacement) _Pragma("deprecated") +#else +#define HEDLEY_DEPRECATED(since) +#define HEDLEY_DEPRECATED_FOR(since, replacement) +#endif + +#if defined(HEDLEY_UNAVAILABLE) +#undef HEDLEY_UNAVAILABLE +#endif +#if HEDLEY_HAS_ATTRIBUTE(warning) || HEDLEY_GCC_VERSION_CHECK(4, 3, 0) || \ + HEDLEY_INTEL_VERSION_CHECK(13, 0, 0) +#define HEDLEY_UNAVAILABLE(available_since) \ + __attribute__((__warning__("Not available until " #available_since))) +#else +#define HEDLEY_UNAVAILABLE(available_since) +#endif + +#if defined(HEDLEY_WARN_UNUSED_RESULT) +#undef HEDLEY_WARN_UNUSED_RESULT +#endif +#if defined(__cplusplus) && (__cplusplus >= 201703L) +#define HEDLEY_WARN_UNUSED_RESULT [[nodiscard]] +#elif HEDLEY_HAS_ATTRIBUTE(warn_unused_result) || \ + HEDLEY_GCC_VERSION_CHECK(3, 4, 0) || \ + HEDLEY_INTEL_VERSION_CHECK(13, 0, 0) || \ + HEDLEY_TI_VERSION_CHECK(8, 0, 0) || \ + (HEDLEY_TI_VERSION_CHECK(7, 3, 0) && \ + defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ + (HEDLEY_SUNPRO_VERSION_CHECK(5, 15, 0) && defined(__cplusplus)) || \ + HEDLEY_PGI_VERSION_CHECK(17, 10, 0) +#define HEDLEY_WARN_UNUSED_RESULT __attribute__((__warn_unused_result__)) +#elif defined(_Check_return_) /* SAL */ +#define HEDLEY_WARN_UNUSED_RESULT _Check_return_ +#else +#define HEDLEY_WARN_UNUSED_RESULT +#endif + +#if defined(HEDLEY_SENTINEL) +#undef HEDLEY_SENTINEL +#endif +#if HEDLEY_HAS_ATTRIBUTE(sentinel) || HEDLEY_GCC_VERSION_CHECK(4, 0, 0) || \ + HEDLEY_INTEL_VERSION_CHECK(13, 0, 0) || \ + HEDLEY_ARM_VERSION_CHECK(5, 4, 0) +#define HEDLEY_SENTINEL(position) __attribute__((__sentinel__(position))) +#else +#define HEDLEY_SENTINEL(position) +#endif + +#if defined(HEDLEY_NO_RETURN) +#undef 
HEDLEY_NO_RETURN +#endif +#if HEDLEY_IAR_VERSION_CHECK(8, 0, 0) +#define HEDLEY_NO_RETURN __noreturn +#elif HEDLEY_INTEL_VERSION_CHECK(13, 0, 0) +#define HEDLEY_NO_RETURN __attribute__((__noreturn__)) +#elif defined(__STDC_VERSION__) && __STDC_VERSION__ >= 201112L +#define HEDLEY_NO_RETURN _Noreturn +#elif defined(__cplusplus) && (__cplusplus >= 201103L) +#define HEDLEY_NO_RETURN [[noreturn]] +#elif HEDLEY_HAS_ATTRIBUTE(noreturn) || HEDLEY_GCC_VERSION_CHECK(3, 2, 0) || \ + HEDLEY_SUNPRO_VERSION_CHECK(5, 11, 0) || \ + HEDLEY_ARM_VERSION_CHECK(4, 1, 0) || \ + HEDLEY_IBM_VERSION_CHECK(10, 1, 0) || \ + HEDLEY_TI_VERSION_CHECK(18, 0, 0) || \ + (HEDLEY_TI_VERSION_CHECK(17, 3, 0) && \ + defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) +#define HEDLEY_NO_RETURN __attribute__((__noreturn__)) +#elif HEDLEY_SUNPRO_VERSION_CHECK(5, 10, 0) +#define HEDLEY_NO_RETURN _Pragma("does_not_return") +#elif HEDLEY_MSVC_VERSION_CHECK(13, 10, 0) +#define HEDLEY_NO_RETURN __declspec(noreturn) +#elif HEDLEY_TI_VERSION_CHECK(6, 0, 0) && defined(__cplusplus) +#define HEDLEY_NO_RETURN _Pragma("FUNC_NEVER_RETURNS;") +#elif HEDLEY_COMPCERT_VERSION_CHECK(3, 2, 0) +#define HEDLEY_NO_RETURN __attribute((noreturn)) +#elif HEDLEY_PELLES_VERSION_CHECK(9, 0, 0) +#define HEDLEY_NO_RETURN __declspec(noreturn) +#else +#define HEDLEY_NO_RETURN +#endif + +#if defined(HEDLEY_UNREACHABLE) +#undef HEDLEY_UNREACHABLE +#endif +#if defined(HEDLEY_UNREACHABLE_RETURN) +#undef HEDLEY_UNREACHABLE_RETURN +#endif +#if (HEDLEY_HAS_BUILTIN(__builtin_unreachable) && \ + (!defined(HEDLEY_ARM_VERSION))) || \ + HEDLEY_GCC_VERSION_CHECK(4, 5, 0) || \ + HEDLEY_INTEL_VERSION_CHECK(13, 0, 0) || \ + HEDLEY_IBM_VERSION_CHECK(13, 1, 5) +#define HEDLEY_UNREACHABLE() __builtin_unreachable() +#elif HEDLEY_MSVC_VERSION_CHECK(13, 10, 0) +#define HEDLEY_UNREACHABLE() __assume(0) +#elif HEDLEY_TI_VERSION_CHECK(6, 0, 0) +#if defined(__cplusplus) +#define HEDLEY_UNREACHABLE() std::_nassert(0) +#else +#define HEDLEY_UNREACHABLE() _nassert(0) +#endif +#define HEDLEY_UNREACHABLE_RETURN(value) return value +#elif defined(EXIT_FAILURE) +#define HEDLEY_UNREACHABLE() abort() +#else +#define HEDLEY_UNREACHABLE() +#define HEDLEY_UNREACHABLE_RETURN(value) return value +#endif +#if !defined(HEDLEY_UNREACHABLE_RETURN) +#define HEDLEY_UNREACHABLE_RETURN(value) HEDLEY_UNREACHABLE() +#endif + +#if defined(HEDLEY_ASSUME) +#undef HEDLEY_ASSUME +#endif +#if HEDLEY_MSVC_VERSION_CHECK(13, 10, 0) || HEDLEY_INTEL_VERSION_CHECK(13, 0, 0) +#define HEDLEY_ASSUME(expr) __assume(expr) +#elif HEDLEY_HAS_BUILTIN(__builtin_assume) +#define HEDLEY_ASSUME(expr) __builtin_assume(expr) +#elif HEDLEY_TI_VERSION_CHECK(6, 0, 0) +#if defined(__cplusplus) +#define HEDLEY_ASSUME(expr) std::_nassert(expr) +#else +#define HEDLEY_ASSUME(expr) _nassert(expr) +#endif +#elif (HEDLEY_HAS_BUILTIN(__builtin_unreachable) && \ + !defined(HEDLEY_ARM_VERSION)) || \ + HEDLEY_GCC_VERSION_CHECK(4, 5, 0) || \ + HEDLEY_INTEL_VERSION_CHECK(13, 0, 0) || \ + HEDLEY_IBM_VERSION_CHECK(13, 1, 5) +#define HEDLEY_ASSUME(expr) ((void)((expr) ? 
1 : (__builtin_unreachable(), 1))) +#else +#define HEDLEY_ASSUME(expr) ((void)(expr)) +#endif + +HEDLEY_DIAGNOSTIC_PUSH +#if HEDLEY_HAS_WARNING("-Wvariadic-macros") || HEDLEY_GCC_VERSION_CHECK(4, 0, 0) +#if defined(__clang__) +#pragma clang diagnostic ignored "-Wvariadic-macros" +#elif defined(HEDLEY_GCC_VERSION) +#pragma GCC diagnostic ignored "-Wvariadic-macros" +#endif +#endif +#if defined(HEDLEY_NON_NULL) +#undef HEDLEY_NON_NULL +#endif +#if HEDLEY_HAS_ATTRIBUTE(nonnull) || HEDLEY_GCC_VERSION_CHECK(3, 3, 0) || \ + HEDLEY_INTEL_VERSION_CHECK(13, 0, 0) || \ + HEDLEY_ARM_VERSION_CHECK(4, 1, 0) +#define HEDLEY_NON_NULL(...) __attribute__((__nonnull__(__VA_ARGS__))) +#else +#define HEDLEY_NON_NULL(...) +#endif +HEDLEY_DIAGNOSTIC_POP + +#if defined(HEDLEY_PRINTF_FORMAT) +#undef HEDLEY_PRINTF_FORMAT +#endif +#if defined(__MINGW32__) && HEDLEY_GCC_HAS_ATTRIBUTE(format, 4, 4, 0) && \ + !defined(__USE_MINGW_ANSI_STDIO) +#define HEDLEY_PRINTF_FORMAT(string_idx, first_to_check) \ + __attribute__((__format__(ms_printf, string_idx, first_to_check))) +#elif defined(__MINGW32__) && HEDLEY_GCC_HAS_ATTRIBUTE(format, 4, 4, 0) && \ + defined(__USE_MINGW_ANSI_STDIO) +#define HEDLEY_PRINTF_FORMAT(string_idx, first_to_check) \ + __attribute__((__format__(gnu_printf, string_idx, first_to_check))) +#elif HEDLEY_HAS_ATTRIBUTE(format) || HEDLEY_GCC_VERSION_CHECK(3, 1, 0) || \ + HEDLEY_INTEL_VERSION_CHECK(13, 0, 0) || \ + HEDLEY_ARM_VERSION_CHECK(5, 6, 0) || \ + HEDLEY_IBM_VERSION_CHECK(10, 1, 0) || \ + HEDLEY_TI_VERSION_CHECK(8, 0, 0) || \ + (HEDLEY_TI_VERSION_CHECK(7, 3, 0) && \ + defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) +#define HEDLEY_PRINTF_FORMAT(string_idx, first_to_check) \ + __attribute__((__format__(__printf__, string_idx, first_to_check))) +#elif HEDLEY_PELLES_VERSION_CHECK(6, 0, 0) +#define HEDLEY_PRINTF_FORMAT(string_idx, first_to_check) \ + __declspec(vaformat(printf, string_idx, first_to_check)) +#else +#define HEDLEY_PRINTF_FORMAT(string_idx, first_to_check) +#endif + +#if defined(HEDLEY_CONSTEXPR) +#undef HEDLEY_CONSTEXPR +#endif +#if defined(__cplusplus) +#if __cplusplus >= 201103L +#define HEDLEY_CONSTEXPR constexpr +#endif +#endif +#if !defined(HEDLEY_CONSTEXPR) +#define HEDLEY_CONSTEXPR +#endif + +#if defined(HEDLEY_PREDICT) +#undef HEDLEY_PREDICT +#endif +#if defined(HEDLEY_LIKELY) +#undef HEDLEY_LIKELY +#endif +#if defined(HEDLEY_UNLIKELY) +#undef HEDLEY_UNLIKELY +#endif +#if defined(HEDLEY_UNPREDICTABLE) +#undef HEDLEY_UNPREDICTABLE +#endif +#if HEDLEY_HAS_BUILTIN(__builtin_unpredictable) +#define HEDLEY_UNPREDICTABLE(expr) __builtin_unpredictable(!!(expr)) +#endif +#if HEDLEY_HAS_BUILTIN(__builtin_expect_with_probability) || \ + HEDLEY_GCC_VERSION_CHECK(9, 0, 0) +#define HEDLEY_PREDICT(expr, value, probability) \ + __builtin_expect_with_probability(expr, value, probability) +#define HEDLEY_PREDICT_TRUE(expr, probability) \ + __builtin_expect_with_probability(!!(expr), 1, probability) +#define HEDLEY_PREDICT_FALSE(expr, probability) \ + __builtin_expect_with_probability(!!(expr), 0, probability) +#define HEDLEY_LIKELY(expr) __builtin_expect(!!(expr), 1) +#define HEDLEY_UNLIKELY(expr) __builtin_expect(!!(expr), 0) +#if !defined(HEDLEY_BUILTIN_UNPREDICTABLE) +#define HEDLEY_BUILTIN_UNPREDICTABLE(expr) \ + __builtin_expect_with_probability(!!(expr), 1, 0.5) +#endif +#elif HEDLEY_HAS_BUILTIN(__builtin_expect) || \ + HEDLEY_GCC_VERSION_CHECK(3, 0, 0) || \ + HEDLEY_INTEL_VERSION_CHECK(13, 0, 0) || \ + (HEDLEY_SUNPRO_VERSION_CHECK(5, 15, 0) && defined(__cplusplus)) || \ + 
HEDLEY_ARM_VERSION_CHECK(4, 1, 0) || \ + HEDLEY_IBM_VERSION_CHECK(10, 1, 0) || \ + HEDLEY_TI_VERSION_CHECK(6, 1, 0) || \ + HEDLEY_TINYC_VERSION_CHECK(0, 9, 27) +#define HEDLEY_PREDICT(expr, expected, probability) \ + (((probability) >= 0.9) ? __builtin_expect(!!(expr), (expected)) \ + : (((void)(expected)), !!(expr))) +#define HEDLEY_PREDICT_TRUE(expr, probability) \ + (__extension__({ \ + HEDLEY_CONSTEXPR double hedley_probability_ = (probability); \ + ((hedley_probability_ >= 0.9) \ + ? __builtin_expect(!!(expr), 1) \ + : ((hedley_probability_ <= 0.1) \ + ? __builtin_expect(!!(expr), 0) \ + : !!(expr))); \ + })) +#define HEDLEY_PREDICT_FALSE(expr, probability) \ + (__extension__({ \ + HEDLEY_CONSTEXPR double hedley_probability_ = (probability); \ + ((hedley_probability_ >= 0.9) \ + ? __builtin_expect(!!(expr), 0) \ + : ((hedley_probability_ <= 0.1) \ + ? __builtin_expect(!!(expr), 1) \ + : !!(expr))); \ + })) +#define HEDLEY_LIKELY(expr) __builtin_expect(!!(expr), 1) +#define HEDLEY_UNLIKELY(expr) __builtin_expect(!!(expr), 0) +#else +#define HEDLEY_PREDICT(expr, expected, probability) \ + (((void)(expected)), !!(expr)) +#define HEDLEY_PREDICT_TRUE(expr, probability) (!!(expr)) +#define HEDLEY_PREDICT_FALSE(expr, probability) (!!(expr)) +#define HEDLEY_LIKELY(expr) (!!(expr)) +#define HEDLEY_UNLIKELY(expr) (!!(expr)) +#endif +#if !defined(HEDLEY_UNPREDICTABLE) +#define HEDLEY_UNPREDICTABLE(expr) HEDLEY_PREDICT(expr, 1, 0.5) +#endif + +#if defined(HEDLEY_MALLOC) +#undef HEDLEY_MALLOC +#endif +#if HEDLEY_HAS_ATTRIBUTE(malloc) || HEDLEY_GCC_VERSION_CHECK(3, 1, 0) || \ + HEDLEY_INTEL_VERSION_CHECK(13, 0, 0) || \ + HEDLEY_SUNPRO_VERSION_CHECK(5, 11, 0) || \ + HEDLEY_ARM_VERSION_CHECK(4, 1, 0) || \ + HEDLEY_IBM_VERSION_CHECK(12, 1, 0) || \ + HEDLEY_TI_VERSION_CHECK(8, 0, 0) || \ + (HEDLEY_TI_VERSION_CHECK(7, 3, 0) && \ + defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) +#define HEDLEY_MALLOC __attribute__((__malloc__)) +#elif HEDLEY_SUNPRO_VERSION_CHECK(5, 10, 0) +#define HEDLEY_MALLOC _Pragma("returns_new_memory") +#elif HEDLEY_MSVC_VERSION_CHECK(14, 0, 0) +#define HEDLEY_MALLOC __declspec(restrict) +#else +#define HEDLEY_MALLOC +#endif + +#if defined(HEDLEY_PURE) +#undef HEDLEY_PURE +#endif +#if HEDLEY_HAS_ATTRIBUTE(pure) || HEDLEY_GCC_VERSION_CHECK(2, 96, 0) || \ + HEDLEY_INTEL_VERSION_CHECK(13, 0, 0) || \ + HEDLEY_SUNPRO_VERSION_CHECK(5, 11, 0) || \ + HEDLEY_ARM_VERSION_CHECK(4, 1, 0) || \ + HEDLEY_IBM_VERSION_CHECK(10, 1, 0) || \ + HEDLEY_TI_VERSION_CHECK(8, 0, 0) || \ + (HEDLEY_TI_VERSION_CHECK(7, 3, 0) && \ + defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ + HEDLEY_PGI_VERSION_CHECK(17, 10, 0) +#define HEDLEY_PURE __attribute__((__pure__)) +#elif HEDLEY_SUNPRO_VERSION_CHECK(5, 10, 0) +#define HEDLEY_PURE _Pragma("does_not_write_global_data") +#elif HEDLEY_TI_VERSION_CHECK(6, 0, 0) && defined(__cplusplus) +#define HEDLEY_PURE _Pragma("FUNC_IS_PURE;") +#else +#define HEDLEY_PURE +#endif + +#if defined(HEDLEY_CONST) +#undef HEDLEY_CONST +#endif +#if HEDLEY_HAS_ATTRIBUTE(const) || HEDLEY_GCC_VERSION_CHECK(2, 5, 0) || \ + HEDLEY_INTEL_VERSION_CHECK(13, 0, 0) || \ + HEDLEY_SUNPRO_VERSION_CHECK(5, 11, 0) || \ + HEDLEY_ARM_VERSION_CHECK(4, 1, 0) || \ + HEDLEY_IBM_VERSION_CHECK(10, 1, 0) || \ + HEDLEY_TI_VERSION_CHECK(8, 0, 0) || \ + (HEDLEY_TI_VERSION_CHECK(7, 3, 0) && \ + defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ + HEDLEY_PGI_VERSION_CHECK(17, 10, 0) +#define HEDLEY_CONST __attribute__((__const__)) +#elif HEDLEY_SUNPRO_VERSION_CHECK(5, 10, 0) +#define HEDLEY_CONST _Pragma("no_side_effect") +#else +#define 
HEDLEY_CONST HEDLEY_PURE +#endif + +#if defined(HEDLEY_RESTRICT) +#undef HEDLEY_RESTRICT +#endif +#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) && \ + !defined(__cplusplus) +#define HEDLEY_RESTRICT restrict +#elif HEDLEY_GCC_VERSION_CHECK(3, 1, 0) || \ + HEDLEY_MSVC_VERSION_CHECK(14, 0, 0) || \ + HEDLEY_INTEL_VERSION_CHECK(13, 0, 0) || \ + HEDLEY_ARM_VERSION_CHECK(4, 1, 0) || \ + HEDLEY_IBM_VERSION_CHECK(10, 1, 0) || \ + HEDLEY_PGI_VERSION_CHECK(17, 10, 0) || \ + HEDLEY_TI_VERSION_CHECK(8, 0, 0) || \ + (HEDLEY_SUNPRO_VERSION_CHECK(5, 14, 0) && defined(__cplusplus)) || \ + HEDLEY_IAR_VERSION_CHECK(8, 0, 0) || defined(__clang__) +#define HEDLEY_RESTRICT __restrict +#elif HEDLEY_SUNPRO_VERSION_CHECK(5, 3, 0) && !defined(__cplusplus) +#define HEDLEY_RESTRICT _Restrict +#else +#define HEDLEY_RESTRICT +#endif + +#if defined(HEDLEY_INLINE) +#undef HEDLEY_INLINE +#endif +#if (defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L)) || \ + (defined(__cplusplus) && (__cplusplus >= 199711L)) +#define HEDLEY_INLINE inline +#elif defined(HEDLEY_GCC_VERSION) || HEDLEY_ARM_VERSION_CHECK(6, 2, 0) +#define HEDLEY_INLINE __inline__ +#elif HEDLEY_MSVC_VERSION_CHECK(12, 0, 0) || \ + HEDLEY_ARM_VERSION_CHECK(4, 1, 0) || HEDLEY_TI_VERSION_CHECK(8, 0, 0) +#define HEDLEY_INLINE __inline +#else +#define HEDLEY_INLINE +#endif + +#if defined(HEDLEY_ALWAYS_INLINE) +#undef HEDLEY_ALWAYS_INLINE +#endif +#if HEDLEY_HAS_ATTRIBUTE(always_inline) || \ + HEDLEY_GCC_VERSION_CHECK(4, 0, 0) || \ + HEDLEY_INTEL_VERSION_CHECK(13, 0, 0) || \ + HEDLEY_SUNPRO_VERSION_CHECK(5, 11, 0) || \ + HEDLEY_ARM_VERSION_CHECK(4, 1, 0) || \ + HEDLEY_IBM_VERSION_CHECK(10, 1, 0) || \ + HEDLEY_TI_VERSION_CHECK(8, 0, 0) || \ + (HEDLEY_TI_VERSION_CHECK(7, 3, 0) && \ + defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) +#define HEDLEY_ALWAYS_INLINE __attribute__((__always_inline__)) HEDLEY_INLINE +#elif HEDLEY_MSVC_VERSION_CHECK(12, 0, 0) +#define HEDLEY_ALWAYS_INLINE __forceinline +#elif HEDLEY_TI_VERSION_CHECK(7, 0, 0) && defined(__cplusplus) +#define HEDLEY_ALWAYS_INLINE _Pragma("FUNC_ALWAYS_INLINE;") +#elif HEDLEY_IAR_VERSION_CHECK(8, 0, 0) +#define HEDLEY_ALWAYS_INLINE _Pragma("inline=forced") +#else +#define HEDLEY_ALWAYS_INLINE HEDLEY_INLINE +#endif + +#if defined(HEDLEY_NEVER_INLINE) +#undef HEDLEY_NEVER_INLINE +#endif +#if HEDLEY_HAS_ATTRIBUTE(noinline) || HEDLEY_GCC_VERSION_CHECK(4, 0, 0) || \ + HEDLEY_INTEL_VERSION_CHECK(13, 0, 0) || \ + HEDLEY_SUNPRO_VERSION_CHECK(5, 11, 0) || \ + HEDLEY_ARM_VERSION_CHECK(4, 1, 0) || \ + HEDLEY_IBM_VERSION_CHECK(10, 1, 0) || \ + HEDLEY_TI_VERSION_CHECK(8, 0, 0) || \ + (HEDLEY_TI_VERSION_CHECK(7, 3, 0) && \ + defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) +#define HEDLEY_NEVER_INLINE __attribute__((__noinline__)) +#elif HEDLEY_MSVC_VERSION_CHECK(13, 10, 0) +#define HEDLEY_NEVER_INLINE __declspec(noinline) +#elif HEDLEY_PGI_VERSION_CHECK(10, 2, 0) +#define HEDLEY_NEVER_INLINE _Pragma("noinline") +#elif HEDLEY_TI_VERSION_CHECK(6, 0, 0) && defined(__cplusplus) +#define HEDLEY_NEVER_INLINE _Pragma("FUNC_CANNOT_INLINE;") +#elif HEDLEY_IAR_VERSION_CHECK(8, 0, 0) +#define HEDLEY_NEVER_INLINE _Pragma("inline=never") +#elif HEDLEY_COMPCERT_VERSION_CHECK(3, 2, 0) +#define HEDLEY_NEVER_INLINE __attribute((noinline)) +#elif HEDLEY_PELLES_VERSION_CHECK(9, 0, 0) +#define HEDLEY_NEVER_INLINE __declspec(noinline) +#else +#define HEDLEY_NEVER_INLINE +#endif + +#if defined(HEDLEY_PRIVATE) +#undef HEDLEY_PRIVATE +#endif +#if defined(HEDLEY_PUBLIC) +#undef HEDLEY_PUBLIC +#endif +#if defined(HEDLEY_IMPORT) +#undef 
HEDLEY_IMPORT +#endif +#if defined(_WIN32) || defined(__CYGWIN__) +#define HEDLEY_PRIVATE +#define HEDLEY_PUBLIC __declspec(dllexport) +#define HEDLEY_IMPORT __declspec(dllimport) +#else +#if HEDLEY_HAS_ATTRIBUTE(visibility) || HEDLEY_GCC_VERSION_CHECK(3, 3, 0) || \ + HEDLEY_SUNPRO_VERSION_CHECK(5, 11, 0) || \ + HEDLEY_INTEL_VERSION_CHECK(13, 0, 0) || \ + HEDLEY_ARM_VERSION_CHECK(4, 1, 0) || \ + HEDLEY_IBM_VERSION_CHECK(13, 1, 0) || \ + HEDLEY_TI_VERSION_CHECK(8, 0, 0) || \ + (HEDLEY_TI_VERSION_CHECK(7, 3, 0) && defined(__TI_EABI__) && \ + defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) +#define HEDLEY_PRIVATE __attribute__((__visibility__("hidden"))) +#define HEDLEY_PUBLIC __attribute__((__visibility__("default"))) +#else +#define HEDLEY_PRIVATE +#define HEDLEY_PUBLIC +#endif +#define HEDLEY_IMPORT extern +#endif + +#if defined(HEDLEY_NO_THROW) +#undef HEDLEY_NO_THROW +#endif +#if HEDLEY_HAS_ATTRIBUTE(nothrow) || HEDLEY_GCC_VERSION_CHECK(3, 3, 0) || \ + HEDLEY_INTEL_VERSION_CHECK(13, 0, 0) +#define HEDLEY_NO_THROW __attribute__((__nothrow__)) +#elif HEDLEY_MSVC_VERSION_CHECK(13, 1, 0) || HEDLEY_ARM_VERSION_CHECK(4, 1, 0) +#define HEDLEY_NO_THROW __declspec(nothrow) +#else +#define HEDLEY_NO_THROW +#endif + +#if defined(HEDLEY_FALL_THROUGH) +#undef HEDLEY_FALL_THROUGH +#endif +#if defined(__cplusplus) && \ + (!defined(HEDLEY_SUNPRO_VERSION) || \ + HEDLEY_SUNPRO_VERSION_CHECK(5, 15, 0)) && \ + !defined(HEDLEY_PGI_VERSION) +#if (__cplusplus >= 201703L) || \ + ((__cplusplus >= 201103L) && HEDLEY_HAS_CPP_ATTRIBUTE(fallthrough)) +#define HEDLEY_FALL_THROUGH [[fallthrough]] +#elif (__cplusplus >= 201103L) && HEDLEY_HAS_CPP_ATTRIBUTE(clang::fallthrough) +#define HEDLEY_FALL_THROUGH [[clang::fallthrough]] +#elif (__cplusplus >= 201103L) && HEDLEY_GCC_VERSION_CHECK(7, 0, 0) +#define HEDLEY_FALL_THROUGH [[gnu::fallthrough]] +#endif +#endif +#if !defined(HEDLEY_FALL_THROUGH) +#if HEDLEY_GNUC_HAS_ATTRIBUTE(fallthrough, 7, 0, 0) && \ + !defined(HEDLEY_PGI_VERSION) +#define HEDLEY_FALL_THROUGH __attribute__((__fallthrough__)) +#elif defined(__fallthrough) /* SAL */ +#define HEDLEY_FALL_THROUGH __fallthrough +#else +#define HEDLEY_FALL_THROUGH +#endif +#endif + +#if defined(HEDLEY_RETURNS_NON_NULL) +#undef HEDLEY_RETURNS_NON_NULL +#endif +#if HEDLEY_HAS_ATTRIBUTE(returns_nonnull) || HEDLEY_GCC_VERSION_CHECK(4, 9, 0) +#define HEDLEY_RETURNS_NON_NULL __attribute__((__returns_nonnull__)) +#elif defined(_Ret_notnull_) /* SAL */ +#define HEDLEY_RETURNS_NON_NULL _Ret_notnull_ +#else +#define HEDLEY_RETURNS_NON_NULL +#endif + +#if defined(HEDLEY_ARRAY_PARAM) +#undef HEDLEY_ARRAY_PARAM +#endif +#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) && \ + !defined(__STDC_NO_VLA__) && !defined(__cplusplus) && \ + !defined(HEDLEY_PGI_VERSION) && !defined(HEDLEY_TINYC_VERSION) +#define HEDLEY_ARRAY_PARAM(name) (name) +#else +#define HEDLEY_ARRAY_PARAM(name) +#endif + +#if defined(HEDLEY_IS_CONSTANT) +#undef HEDLEY_IS_CONSTANT +#endif +#if defined(HEDLEY_REQUIRE_CONSTEXPR) +#undef HEDLEY_REQUIRE_CONSTEXPR +#endif +/* Note the double-underscore. For internal use only; no API + * guarantees! 
*/ +#if defined(HEDLEY__IS_CONSTEXPR) +#undef HEDLEY__IS_CONSTEXPR +#endif + +#if HEDLEY_HAS_BUILTIN(__builtin_constant_p) || \ + HEDLEY_GCC_VERSION_CHECK(3, 4, 0) || \ + HEDLEY_INTEL_VERSION_CHECK(13, 0, 0) || \ + HEDLEY_TINYC_VERSION_CHECK(0, 9, 19) || \ + HEDLEY_ARM_VERSION_CHECK(4, 1, 0) || \ + HEDLEY_IBM_VERSION_CHECK(13, 1, 0) || \ + HEDLEY_TI_VERSION_CHECK(6, 1, 0) || \ + (HEDLEY_SUNPRO_VERSION_CHECK(5, 10, 0) && !defined(__cplusplus)) || \ + HEDLEY_CRAY_VERSION_CHECK(8, 1, 0) +#define HEDLEY_IS_CONSTANT(expr) __builtin_constant_p(expr) +#endif +#if !defined(__cplusplus) +#if HEDLEY_HAS_BUILTIN(__builtin_types_compatible_p) || \ + HEDLEY_GCC_VERSION_CHECK(3, 4, 0) || \ + HEDLEY_INTEL_VERSION_CHECK(13, 0, 0) || \ + HEDLEY_IBM_VERSION_CHECK(13, 1, 0) || \ + HEDLEY_CRAY_VERSION_CHECK(8, 1, 0) || \ + HEDLEY_ARM_VERSION_CHECK(5, 4, 0) || \ + HEDLEY_TINYC_VERSION_CHECK(0, 9, 24) +#if defined(__INTPTR_TYPE__) +#define HEDLEY__IS_CONSTEXPR(expr) \ + __builtin_types_compatible_p( \ + __typeof__((1 ? (void *)((__INTPTR_TYPE__)((expr)*0)) \ + : (int *)0)), \ + int *) +#else +#include +#define HEDLEY__IS_CONSTEXPR(expr) \ + __builtin_types_compatible_p( \ + __typeof__((1 ? (void *)((intptr_t)((expr)*0)) : (int *)0)), \ + int *) +#endif +#elif (defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L) && \ + !defined(HEDLEY_SUNPRO_VERSION) && !defined(HEDLEY_PGI_VERSION)) || \ + HEDLEY_HAS_EXTENSION(c_generic_selections) || \ + HEDLEY_GCC_VERSION_CHECK(4, 9, 0) || \ + HEDLEY_INTEL_VERSION_CHECK(17, 0, 0) || \ + HEDLEY_IBM_VERSION_CHECK(12, 1, 0) || \ + HEDLEY_ARM_VERSION_CHECK(5, 3, 0) +#if defined(__INTPTR_TYPE__) +#define HEDLEY__IS_CONSTEXPR(expr) \ + _Generic((1 ? (void *)((__INTPTR_TYPE__)((expr)*0)) : (int *)0), \ + int * : 1, void * : 0) +#else +#include +#define HEDLEY__IS_CONSTEXPR(expr) \ + _Generic((1 ? (void *)((intptr_t)*0) : (int *)0), int * : 1, void * : 0) +#endif +#elif defined(HEDLEY_GCC_VERSION) || defined(HEDLEY_INTEL_VERSION) || \ + defined(HEDLEY_TINYC_VERSION) || defined(HEDLEY_TI_VERSION) || \ + defined(__clang__) +#define HEDLEY__IS_CONSTEXPR(expr) \ + (sizeof(void) != sizeof(*(1 ? ((void *)((expr)*0L)) : ((struct { \ + char v[sizeof(void) * 2]; \ + } *)1)))) +#endif +#endif +#if defined(HEDLEY__IS_CONSTEXPR) +#if !defined(HEDLEY_IS_CONSTANT) +#define HEDLEY_IS_CONSTANT(expr) HEDLEY__IS_CONSTEXPR(expr) +#endif +#define HEDLEY_REQUIRE_CONSTEXPR(expr) \ + (HEDLEY__IS_CONSTEXPR(expr) ? 
(expr) : (-1)) +#else +#if !defined(HEDLEY_IS_CONSTANT) +#define HEDLEY_IS_CONSTANT(expr) (0) +#endif +#define HEDLEY_REQUIRE_CONSTEXPR(expr) (expr) +#endif + +#if defined(HEDLEY_BEGIN_C_DECLS) +#undef HEDLEY_BEGIN_C_DECLS +#endif +#if defined(HEDLEY_END_C_DECLS) +#undef HEDLEY_END_C_DECLS +#endif +#if defined(HEDLEY_C_DECL) +#undef HEDLEY_C_DECL +#endif +#if defined(__cplusplus) +#define HEDLEY_BEGIN_C_DECLS extern "C" { +#define HEDLEY_END_C_DECLS } +#define HEDLEY_C_DECL extern "C" +#else +#define HEDLEY_BEGIN_C_DECLS +#define HEDLEY_END_C_DECLS +#define HEDLEY_C_DECL +#endif + +#if defined(HEDLEY_STATIC_ASSERT) +#undef HEDLEY_STATIC_ASSERT +#endif +#if !defined(__cplusplus) && \ + ((defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L)) || \ + HEDLEY_HAS_FEATURE(c_static_assert) || \ + HEDLEY_GCC_VERSION_CHECK(6, 0, 0) || \ + HEDLEY_INTEL_VERSION_CHECK(13, 0, 0) || defined(_Static_assert)) +#define HEDLEY_STATIC_ASSERT(expr, message) _Static_assert(expr, message) +#elif (defined(__cplusplus) && (__cplusplus >= 201103L)) || \ + HEDLEY_MSVC_VERSION_CHECK(16, 0, 0) || \ + (defined(__cplusplus) && HEDLEY_TI_VERSION_CHECK(8, 3, 0)) +#define HEDLEY_STATIC_ASSERT(expr, message) static_assert(expr, message) +#else +#define HEDLEY_STATIC_ASSERT(expr, message) +#endif + +#if defined(HEDLEY_CONST_CAST) +#undef HEDLEY_CONST_CAST +#endif +#if defined(__cplusplus) +#define HEDLEY_CONST_CAST(T, expr) (const_cast(expr)) +#elif HEDLEY_HAS_WARNING("-Wcast-qual") || \ + HEDLEY_GCC_VERSION_CHECK(4, 6, 0) || \ + HEDLEY_INTEL_VERSION_CHECK(13, 0, 0) +#define HEDLEY_CONST_CAST(T, expr) \ + (__extension__({ \ + HEDLEY_DIAGNOSTIC_PUSH \ + HEDLEY_DIAGNOSTIC_DISABLE_CAST_QUAL((T)(expr)); \ + HEDLEY_DIAGNOSTIC_POP \ + })) +#else +#define HEDLEY_CONST_CAST(T, expr) ((T)(expr)) +#endif + +#if defined(HEDLEY_REINTERPRET_CAST) +#undef HEDLEY_REINTERPRET_CAST +#endif +#if defined(__cplusplus) +#define HEDLEY_REINTERPRET_CAST(T, expr) (reinterpret_cast(expr)) +#else +#define HEDLEY_REINTERPRET_CAST(T, expr) (*((T *)&(expr))) +#endif + +#if defined(HEDLEY_STATIC_CAST) +#undef HEDLEY_STATIC_CAST +#endif +#if defined(__cplusplus) +#define HEDLEY_STATIC_CAST(T, expr) (static_cast(expr)) +#else +#define HEDLEY_STATIC_CAST(T, expr) ((T)(expr)) +#endif + +#if defined(HEDLEY_CPP_CAST) +#undef HEDLEY_CPP_CAST +#endif +#if defined(__cplusplus) +#define HEDLEY_CPP_CAST(T, expr) static_cast(expr) +#else +#define HEDLEY_CPP_CAST(T, expr) (expr) +#endif + +#if defined(HEDLEY_MESSAGE) +#undef HEDLEY_MESSAGE +#endif +#if HEDLEY_HAS_WARNING("-Wunknown-pragmas") +#define HEDLEY_MESSAGE(msg) \ + HEDLEY_DIAGNOSTIC_PUSH \ + HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_PRAGMAS \ + HEDLEY_PRAGMA(message msg) \ + HEDLEY_DIAGNOSTIC_POP +#elif HEDLEY_GCC_VERSION_CHECK(4, 4, 0) || HEDLEY_INTEL_VERSION_CHECK(13, 0, 0) +#define HEDLEY_MESSAGE(msg) HEDLEY_PRAGMA(message msg) +#elif HEDLEY_CRAY_VERSION_CHECK(5, 0, 0) +#define HEDLEY_MESSAGE(msg) HEDLEY_PRAGMA(_CRI message msg) +#elif HEDLEY_IAR_VERSION_CHECK(8, 0, 0) +#define HEDLEY_MESSAGE(msg) HEDLEY_PRAGMA(message(msg)) +#elif HEDLEY_PELLES_VERSION_CHECK(2, 0, 0) +#define HEDLEY_MESSAGE(msg) HEDLEY_PRAGMA(message(msg)) +#else +#define HEDLEY_MESSAGE(msg) +#endif + +#if defined(HEDLEY_WARNING) +#undef HEDLEY_WARNING +#endif +#if HEDLEY_HAS_WARNING("-Wunknown-pragmas") +#define HEDLEY_WARNING(msg) \ + HEDLEY_DIAGNOSTIC_PUSH \ + HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_PRAGMAS \ + HEDLEY_PRAGMA(clang warning msg) \ + HEDLEY_DIAGNOSTIC_POP +#elif HEDLEY_GCC_VERSION_CHECK(4, 8, 0) || HEDLEY_PGI_VERSION_CHECK(18, 
4, 0) +#define HEDLEY_WARNING(msg) HEDLEY_PRAGMA(GCC warning msg) +#elif HEDLEY_MSVC_VERSION_CHECK(15, 0, 0) +#define HEDLEY_WARNING(msg) HEDLEY_PRAGMA(message(msg)) +#else +#define HEDLEY_WARNING(msg) HEDLEY_MESSAGE(msg) +#endif + +#if defined(HEDLEY_REQUIRE_MSG) +#undef HEDLEY_REQUIRE_MSG +#endif +#if HEDLEY_HAS_ATTRIBUTE(diagnose_if) +#if HEDLEY_HAS_WARNING("-Wgcc-compat") +#define HEDLEY_REQUIRE_MSG(expr, msg) \ + HEDLEY_DIAGNOSTIC_PUSH \ + _Pragma("clang diagnostic ignored \"-Wgcc-compat\"") \ + __attribute__((__diagnose_if__(!(expr), msg, "error"))) \ + HEDLEY_DIAGNOSTIC_POP +#else +#define HEDLEY_REQUIRE_MSG(expr, msg) \ + __attribute__((__diagnose_if__(!(expr), msg, "error"))) +#endif +#else +#define HEDLEY_REQUIRE_MSG(expr, msg) +#endif + +#if defined(HEDLEY_REQUIRE) +#undef HEDLEY_REQUIRE +#endif +#define HEDLEY_REQUIRE(expr) HEDLEY_REQUIRE_MSG(expr, #expr) + +#if defined(HEDLEY_FLAGS) +#undef HEDLEY_FLAGS +#endif +#if HEDLEY_HAS_ATTRIBUTE(flag_enum) +#define HEDLEY_FLAGS __attribute__((__flag_enum__)) +#endif + +#if defined(HEDLEY_FLAGS_CAST) +#undef HEDLEY_FLAGS_CAST +#endif +#if HEDLEY_INTEL_VERSION_CHECK(19, 0, 0) +#define HEDLEY_FLAGS_CAST(T, expr) \ + (__extension__({ \ + HEDLEY_DIAGNOSTIC_PUSH \ + _Pragma("warning(disable:188)")((T)(expr)); \ + HEDLEY_DIAGNOSTIC_POP \ + })) +#else +#define HEDLEY_FLAGS_CAST(T, expr) HEDLEY_STATIC_CAST(T, expr) +#endif + +#if defined(HEDLEY_EMPTY_BASES) +#undef HEDLEY_EMPTY_BASES +#endif +#if HEDLEY_MSVC_VERSION_CHECK(19, 0, 23918) && \ + !HEDLEY_MSVC_VERSION_CHECK(20, 0, 0) +#define HEDLEY_EMPTY_BASES __declspec(empty_bases) +#else +#define HEDLEY_EMPTY_BASES +#endif + +/* Remaining macros are deprecated. */ + +#if defined(HEDLEY_GCC_NOT_CLANG_VERSION_CHECK) +#undef HEDLEY_GCC_NOT_CLANG_VERSION_CHECK +#endif +#if defined(__clang__) +#define HEDLEY_GCC_NOT_CLANG_VERSION_CHECK(major, minor, patch) (0) +#else +#define HEDLEY_GCC_NOT_CLANG_VERSION_CHECK(major, minor, patch) \ + HEDLEY_GCC_VERSION_CHECK(major, minor, patch) +#endif + +#if defined(HEDLEY_CLANG_HAS_ATTRIBUTE) +#undef HEDLEY_CLANG_HAS_ATTRIBUTE +#endif +#define HEDLEY_CLANG_HAS_ATTRIBUTE(attribute) HEDLEY_HAS_ATTRIBUTE(attribute) + +#if defined(HEDLEY_CLANG_HAS_CPP_ATTRIBUTE) +#undef HEDLEY_CLANG_HAS_CPP_ATTRIBUTE +#endif +#define HEDLEY_CLANG_HAS_CPP_ATTRIBUTE(attribute) \ + HEDLEY_HAS_CPP_ATTRIBUTE(attribute) + +#if defined(HEDLEY_CLANG_HAS_BUILTIN) +#undef HEDLEY_CLANG_HAS_BUILTIN +#endif +#define HEDLEY_CLANG_HAS_BUILTIN(builtin) HEDLEY_HAS_BUILTIN(builtin) + +#if defined(HEDLEY_CLANG_HAS_FEATURE) +#undef HEDLEY_CLANG_HAS_FEATURE +#endif +#define HEDLEY_CLANG_HAS_FEATURE(feature) HEDLEY_HAS_FEATURE(feature) + +#if defined(HEDLEY_CLANG_HAS_EXTENSION) +#undef HEDLEY_CLANG_HAS_EXTENSION +#endif +#define HEDLEY_CLANG_HAS_EXTENSION(extension) HEDLEY_HAS_EXTENSION(extension) + +#if defined(HEDLEY_CLANG_HAS_DECLSPEC_DECLSPEC_ATTRIBUTE) +#undef HEDLEY_CLANG_HAS_DECLSPEC_DECLSPEC_ATTRIBUTE +#endif +#define HEDLEY_CLANG_HAS_DECLSPEC_ATTRIBUTE(attribute) \ + HEDLEY_HAS_DECLSPEC_ATTRIBUTE(attribute) + +#if defined(HEDLEY_CLANG_HAS_WARNING) +#undef HEDLEY_CLANG_HAS_WARNING +#endif +#define HEDLEY_CLANG_HAS_WARNING(warning) HEDLEY_HAS_WARNING(warning) + +#endif /* !defined(HEDLEY_VERSION) || (HEDLEY_VERSION < X) */ diff --git a/libobs/util/aarch/mmx.h b/libobs/util/aarch/mmx.h new file mode 100644 index 000000000..fd38acbcd --- /dev/null +++ b/libobs/util/aarch/mmx.h @@ -0,0 +1,1356 @@ +/* Copyright (c) 2017-2018 Evan Nemerson + * + * Permission is hereby granted, free of charge, to any 
person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, + * modify, merge, publish, distribute, sublicense, and/or sell copies + * of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#if !defined(SIMDE__MMX_H) +#if !defined(SIMDE__MMX_H) +#define SIMDE__MMX_H +#endif +#include "simde-common.h" + +#if defined(SIMDE_MMX_FORCE_NATIVE) +#define SIMDE_MMX_NATIVE +#elif defined(__MMX__) && !defined(SIMDE_MMX_NO_NATIVE) && \ + !defined(SIMDE_NO_NATIVE) +#define SIMDE_MMX_NATIVE +#elif defined(__ARM_NEON) && !defined(SIMDE_MMX_NO_NEON) && \ + !defined(SIMDE_NO_NEON) +#define SIMDE_MMX_NEON +#endif + +#if defined(SIMDE_MMX_NATIVE) +#include +#else +#if defined(SIMDE_MMX_NEON) +#include +#endif +#endif +#include +#include +#include +#include + +SIMDE__BEGIN_DECLS + +typedef union { +#if defined(SIMDE__ENABLE_GCC_VEC_EXT) + int8_t i8 __attribute__((__vector_size__(8), __may_alias__)); + int16_t i16 __attribute__((__vector_size__(8), __may_alias__)); + int32_t i32 __attribute__((__vector_size__(8), __may_alias__)); + int64_t i64 __attribute__((__vector_size__(8), __may_alias__)); + uint8_t u8 __attribute__((__vector_size__(8), __may_alias__)); + uint16_t u16 __attribute__((__vector_size__(8), __may_alias__)); + uint32_t u32 __attribute__((__vector_size__(8), __may_alias__)); + uint64_t u64 __attribute__((__vector_size__(8), __may_alias__)); + simde_float32 f32 __attribute__((__vector_size__(8), __may_alias__)); +#else + int8_t i8[8]; + int16_t i16[4]; + int32_t i32[2]; + int64_t i64[1]; + uint8_t u8[8]; + uint16_t u16[4]; + uint32_t u32[2]; + uint64_t u64[1]; + simde_float32 f32[2]; +#endif + +#if defined(SIMDE_MMX_NATIVE) + __m64 n; +#elif defined(SIMDE_MMX_NEON) + int8x8_t neon_i8; + int16x4_t neon_i16; + int32x2_t neon_i32; + int64x1_t neon_i64; + uint8x8_t neon_u8; + uint16x4_t neon_u16; + uint32x2_t neon_u32; + uint64x1_t neon_u64; + float32x2_t neon_f32; +#endif +} simde__m64; + +#if defined(SIMDE_MMX_NATIVE) +HEDLEY_STATIC_ASSERT(sizeof(__m64) == sizeof(simde__m64), + "__m64 size doesn't match simde__m64 size"); +SIMDE__FUNCTION_ATTRIBUTES simde__m64 SIMDE__M64_C(__m64 v) +{ + simde__m64 r; + r.n = v; + return r; +} +#elif defined(SIMDE_MMX_NEON) +#define SIMDE__M64_NEON_C(T, expr) \ + (simde__m64) { .neon_##T = (expr) } +#endif +HEDLEY_STATIC_ASSERT(8 == sizeof(simde__m64), "__m64 size incorrect"); + +SIMDE__FUNCTION_ATTRIBUTES +simde__m64 simde_mm_add_pi8(simde__m64 a, simde__m64 b) +{ +#if defined(SIMDE_MMX_NATIVE) + return SIMDE__M64_C(_mm_add_pi8(a.n, b.n)); +#else + simde__m64 r; + SIMDE__VECTORIZE + for (size_t i = 0; i < 8; i++) { + r.i8[i] = a.i8[i] + b.i8[i]; + } + return r; +#endif +} +#define simde_m_paddb(a, b) simde_mm_add_pi8(a, b) 
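For readers unfamiliar with the pattern used throughout mmx.h: when neither SIMDE_MMX_NATIVE nor SIMDE_MMX_NEON is defined, each wrapper such as simde_mm_add_pi8 above simply loops over the lanes of the simde__m64 union, so code written against the Intel intrinsics still compiles and produces the same lane-by-lane results on aarch64 or any other target. The following stand-alone sketch models that portable fallback path; it is illustrative only, and the demo_* names are hypothetical rather than part of the patch.

#include <stdint.h>
#include <stdio.h>

/* Minimal stand-in for simde__m64: eight 8-bit lanes in 64 bits. */
typedef union {
	int8_t i8[8];
	int64_t i64[1];
} demo_m64;

/* Scalar fallback equivalent of _mm_add_pi8 / simde_mm_add_pi8. */
static demo_m64 demo_add_pi8(demo_m64 a, demo_m64 b)
{
	demo_m64 r;
	for (int i = 0; i < 8; i++)
		r.i8[i] = (int8_t)(a.i8[i] + b.i8[i]);
	return r;
}

int main(void)
{
	demo_m64 a = {{1, 2, 3, 4, 5, 6, 7, 8}};
	demo_m64 b = {{10, 20, 30, 40, 50, 60, 70, 80}};
	demo_m64 r = demo_add_pi8(a, b);
	for (int i = 0; i < 8; i++)
		printf("%d ", r.i8[i]); /* prints 11 22 33 44 55 66 77 88 */
	printf("\n");
	return 0;
}

Any C99 compiler will build this without MMX or NEON support, which is exactly the property the fallback branches above rely on; the native and NEON branches only exist as faster drop-in replacements for the same per-lane behaviour.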
+ +SIMDE__FUNCTION_ATTRIBUTES +simde__m64 simde_mm_add_pi16(simde__m64 a, simde__m64 b) +{ +#if defined(SIMDE_MMX_NATIVE) + return SIMDE__M64_C(_mm_add_pi16(a.n, b.n)); +#else + simde__m64 r; + SIMDE__VECTORIZE + for (size_t i = 0; i < (8 / sizeof(int16_t)); i++) { + r.i16[i] = a.i16[i] + b.i16[i]; + } + return r; +#endif +} +#define simde_m_paddw(a, b) simde_mm_add_pi16(a, b) + +SIMDE__FUNCTION_ATTRIBUTES +simde__m64 simde_mm_add_pi32(simde__m64 a, simde__m64 b) +{ +#if defined(SIMDE_MMX_NATIVE) + return SIMDE__M64_C(_mm_add_pi32(a.n, b.n)); +#else + simde__m64 r; + SIMDE__VECTORIZE + for (size_t i = 0; i < (8 / sizeof(int32_t)); i++) { + r.i32[i] = a.i32[i] + b.i32[i]; + } + return r; +#endif +} +#define simde_m_paddd(a, b) simde_mm_add_pi32(a, b) + +SIMDE__FUNCTION_ATTRIBUTES +simde__m64 simde_mm_adds_pi8(simde__m64 a, simde__m64 b) +{ +#if defined(SIMDE_MMX_NATIVE) + return SIMDE__M64_C(_mm_adds_pi8(a.n, b.n)); +#else + simde__m64 r; + SIMDE__VECTORIZE + for (int i = 0; i < 8; i++) { + if ((((b.i8[i]) > 0) && ((a.i8[i]) > (INT8_MAX - (b.i8[i]))))) { + r.i8[i] = INT8_MAX; + } else if ((((b.i8[i]) < 0) && + ((a.i8[i]) < (INT8_MIN - (b.i8[i]))))) { + r.i8[i] = INT8_MIN; + } else { + r.i8[i] = (a.i8[i]) + (b.i8[i]); + } + } + return r; +#endif +} +#define simde_m_paddsb(a, b) simde_mm_adds_pi8(a, b) + +SIMDE__FUNCTION_ATTRIBUTES +simde__m64 simde_mm_adds_pu8(simde__m64 a, simde__m64 b) +{ +#if defined(SIMDE_MMX_NATIVE) + return SIMDE__M64_C(_mm_adds_pu8(a.n, b.n)); +#else + simde__m64 r; + SIMDE__VECTORIZE + for (size_t i = 0; i < 8; i++) { + const int32_t x = a.u8[i] + b.u8[i]; + if (x < 0) + r.u8[i] = 0; + else if (x > UINT8_MAX) + r.u8[i] = UINT8_MAX; + else + r.u8[i] = (uint8_t)x; + } + return r; +#endif +} +#define simde_m_paddusb(a, b) simde_mm_adds_pu8(a, b) + +SIMDE__FUNCTION_ATTRIBUTES +simde__m64 simde_mm_adds_pi16(simde__m64 a, simde__m64 b) +{ +#if defined(SIMDE_MMX_NATIVE) + return SIMDE__M64_C(_mm_adds_pi16(a.n, b.n)); +#else + simde__m64 r; + SIMDE__VECTORIZE + for (int i = 0; i < 4; i++) { + if ((((b.i16[i]) > 0) && + ((a.i16[i]) > (INT16_MAX - (b.i16[i]))))) { + r.i16[i] = INT16_MAX; + } else if ((((b.i16[i]) < 0) && + ((a.i16[i]) < (SHRT_MIN - (b.i16[i]))))) { + r.i16[i] = SHRT_MIN; + } else { + r.i16[i] = (a.i16[i]) + (b.i16[i]); + } + } + return r; +#endif +} +#define simde_m_paddsw(a, b) simde_mm_adds_pi16(a, b) + +SIMDE__FUNCTION_ATTRIBUTES +simde__m64 simde_mm_adds_pu16(simde__m64 a, simde__m64 b) +{ +#if defined(SIMDE_MMX_NATIVE) + return SIMDE__M64_C(_mm_adds_pu16(a.n, b.n)); +#else + simde__m64 r; + SIMDE__VECTORIZE + for (size_t i = 0; i < (8 / sizeof(int16_t)); i++) { + const uint32_t x = a.u16[i] + b.u16[i]; + if (x > UINT16_MAX) + r.u16[i] = UINT16_MAX; + else + r.u16[i] = (uint16_t)x; + } + return r; +#endif +} +#define simde_m_paddusw(a, b) simde_mm_adds_pu16(a, b) + +SIMDE__FUNCTION_ATTRIBUTES +simde__m64 simde_mm_and_si64(simde__m64 a, simde__m64 b) +{ +#if defined(SIMDE_MMX_NATIVE) + return SIMDE__M64_C(_mm_and_si64(a.n, b.n)); +#else + simde__m64 r; + r.i64[0] = a.i64[0] & b.i64[0]; + return r; +#endif +} +#define simde_m_pand(a, b) simde_mm_and_si64(a, b) + +SIMDE__FUNCTION_ATTRIBUTES +simde__m64 simde_mm_andnot_si64(simde__m64 a, simde__m64 b) +{ +#if defined(SIMDE_MMX_NATIVE) + return SIMDE__M64_C(_mm_andnot_si64(a.n, b.n)); +#else + simde__m64 r; + r.i64[0] = ~(a.i64[0]) & b.i64[0]; + return r; +#endif +} +#define simde_m_pandn(a, b) simde_mm_andnot_si64(a, b) + +SIMDE__FUNCTION_ATTRIBUTES +simde__m64 simde_mm_cmpeq_pi8(simde__m64 a, simde__m64 b) +{ 
+#if defined(SIMDE_MMX_NATIVE) + return SIMDE__M64_C(_mm_cmpeq_pi8(a.n, b.n)); +#else + simde__m64 r; + SIMDE__VECTORIZE + for (int i = 0; i < 8; i++) { + r.i8[i] = (a.i8[i] == b.i8[i]) * 0xff; + } + return r; +#endif +} +#define simde_m_pcmpeqb(a, b) simde_mm_cmpeq_pi8(a, b) + +SIMDE__FUNCTION_ATTRIBUTES +simde__m64 simde_mm_cmpeq_pi16(simde__m64 a, simde__m64 b) +{ +#if defined(SIMDE_MMX_NATIVE) + return SIMDE__M64_C(_mm_cmpeq_pi16(a.n, b.n)); +#else + simde__m64 r; + SIMDE__VECTORIZE + for (int i = 0; i < 4; i++) { + r.i16[i] = (a.i16[i] == b.i16[i]) * 0xffff; + } + return r; +#endif +} +#define simde_m_pcmpeqw(a, b) simde_mm_cmpeq_pi16(a, b) + +SIMDE__FUNCTION_ATTRIBUTES +simde__m64 simde_mm_cmpeq_pi32(simde__m64 a, simde__m64 b) +{ +#if defined(SIMDE_MMX_NATIVE) + return SIMDE__M64_C(_mm_cmpeq_pi32(a.n, b.n)); +#else + simde__m64 r; + SIMDE__VECTORIZE + for (int i = 0; i < 2; i++) { + r.i32[i] = (a.i32[i] == b.i32[i]) * 0xffffffff; + } + return r; +#endif +} +#define simde_m_pcmpeqd(a, b) simde_mm_cmpeq_pi32(a, b) + +SIMDE__FUNCTION_ATTRIBUTES +simde__m64 simde_mm_cmpgt_pi8(simde__m64 a, simde__m64 b) +{ +#if defined(SIMDE_MMX_NATIVE) + return SIMDE__M64_C(_mm_cmpgt_pi8(a.n, b.n)); +#else + simde__m64 r; + SIMDE__VECTORIZE + for (int i = 0; i < 8; i++) { + r.i8[i] = (a.i8[i] > b.i8[i]) * 0xff; + } + return r; +#endif +} +#define simde_m_pcmpgtb(a, b) simde_mm_cmpgt_pi8(a, b) + +SIMDE__FUNCTION_ATTRIBUTES +simde__m64 simde_mm_cmpgt_pi16(simde__m64 a, simde__m64 b) +{ +#if defined(SIMDE_MMX_NATIVE) + return SIMDE__M64_C(_mm_cmpgt_pi16(a.n, b.n)); +#else + simde__m64 r; + SIMDE__VECTORIZE + for (int i = 0; i < 4; i++) { + r.i16[i] = (a.i16[i] > b.i16[i]) * 0xffff; + } + return r; +#endif +} +#define simde_m_pcmpgtw(a, b) simde_mm_cmpgt_pi16(a, b) + +SIMDE__FUNCTION_ATTRIBUTES +simde__m64 simde_mm_cmpgt_pi32(simde__m64 a, simde__m64 b) +{ +#if defined(SIMDE_MMX_NATIVE) + return SIMDE__M64_C(_mm_cmpgt_pi32(a.n, b.n)); +#else + simde__m64 r; + SIMDE__VECTORIZE + for (int i = 0; i < 2; i++) { + r.i32[i] = (a.i32[i] > b.i32[i]) * 0xffffffff; + } + return r; +#endif +} +#define simde_m_pcmpgtd(a, b) simde_mm_cmpgt_pi32(a, b) + +SIMDE__FUNCTION_ATTRIBUTES +int64_t simde_mm_cvtm64_si64(simde__m64 a) +{ +#if defined(SIMDE_MMX_NATIVE) && defined(SIMDE_ARCH_AMD64) && !defined(__PGI) + return _mm_cvtm64_si64(a.n); +#else + return a.i64[0]; +#endif +} +#define simde_m_to_int64(a) simde_mm_cvtm64_si64(a) + +SIMDE__FUNCTION_ATTRIBUTES +simde__m64 simde_mm_cvtsi32_si64(int32_t a) +{ +#if defined(SIMDE_MMX_NATIVE) + return SIMDE__M64_C(_mm_cvtsi32_si64(a)); +#else + simde__m64 r; + r.i32[0] = a; + r.i32[1] = 0; + return r; +#endif +} +#define simde_m_from_int(a) simde_mm_cvtsi32_si64(a) + +SIMDE__FUNCTION_ATTRIBUTES +simde__m64 simde_mm_cvtsi64_m64(int64_t a) +{ +#if defined(SIMDE_MMX_NATIVE) && defined(SIMDE_ARCH_AMD64) && !defined(__PGI) + return SIMDE__M64_C(_mm_cvtsi64_m64(a)); +#else + simde__m64 r; + r.i64[0] = a; + return r; +#endif +} +#define simde_m_from_int64(a) simde_mm_cvtsi64_m64(a) + +SIMDE__FUNCTION_ATTRIBUTES +int32_t simde_mm_cvtsi64_si32(simde__m64 a) +{ +#if defined(SIMDE_MMX_NATIVE) + return _mm_cvtsi64_si32(a.n); +#else + return a.i32[0]; +#endif +} + +SIMDE__FUNCTION_ATTRIBUTES +void simde_mm_empty(void) +{ +#if defined(SIMDE_MMX_NATIVE) + _mm_empty(); +#else +#endif +} +#define simde_m_empty() simde_mm_empty() + +SIMDE__FUNCTION_ATTRIBUTES +simde__m64 simde_mm_madd_pi16(simde__m64 a, simde__m64 b) +{ +#if defined(SIMDE_MMX_NATIVE) + return SIMDE__M64_C(_mm_madd_pi16(a.n, b.n)); 
+#else + simde__m64 r; + SIMDE__VECTORIZE + for (int i = 0; i < 4; i += 2) { + r.i32[i / 2] = + (a.i16[i] * b.i16[i]) + (a.i16[i + 1] * b.i16[i + 1]); + } + return r; +#endif +} +#define simde_m_pmaddwd(a, b) simde_mm_madd_pi16(a, b) + +SIMDE__FUNCTION_ATTRIBUTES +simde__m64 simde_mm_mulhi_pi16(simde__m64 a, simde__m64 b) +{ +#if defined(SIMDE_MMX_NATIVE) + return SIMDE__M64_C(_mm_mulhi_pi16(a.n, b.n)); +#else + simde__m64 r; + SIMDE__VECTORIZE + for (int i = 0; i < 4; i++) { + r.i16[i] = (int16_t)((a.i16[i] * b.i16[i]) >> 16); + } + return r; +#endif +} +#define simde_m_pmulhw(a, b) simde_mm_mulhi_pi16(a, b) + +SIMDE__FUNCTION_ATTRIBUTES +simde__m64 simde_mm_mullo_pi16(simde__m64 a, simde__m64 b) +{ +#if defined(SIMDE_MMX_NATIVE) + return SIMDE__M64_C(_mm_mullo_pi16(a.n, b.n)); +#else + simde__m64 r; + SIMDE__VECTORIZE + for (int i = 0; i < 4; i++) { + r.i16[i] = (int16_t)((a.i16[i] * b.i16[i]) & 0xffff); + } + return r; +#endif +} +#define simde_m_pmullw(a, b) simde_mm_mullo_pi16(a, b) + +SIMDE__FUNCTION_ATTRIBUTES +simde__m64 simde_mm_or_si64(simde__m64 a, simde__m64 b) +{ +#if defined(SIMDE_MMX_NATIVE) + return SIMDE__M64_C(_mm_or_si64(a.n, b.n)); +#else + simde__m64 r; + r.i64[0] = a.i64[0] | b.i64[0]; + return r; +#endif +} +#define simde_m_por(a, b) simde_mm_or_si64(a, b) + +SIMDE__FUNCTION_ATTRIBUTES +simde__m64 simde_mm_packs_pi16(simde__m64 a, simde__m64 b) +{ +#if defined(SIMDE_MMX_NATIVE) + return SIMDE__M64_C(_mm_packs_pi16(a.n, b.n)); +#else + simde__m64 r; + + SIMDE__VECTORIZE + for (size_t i = 0; i < (8 / sizeof(int16_t)); i++) { + if (a.i16[i] < INT8_MIN) { + r.i8[i] = INT8_MIN; + } else if (a.i16[i] > INT8_MAX) { + r.i8[i] = INT8_MAX; + } else { + r.i8[i] = (int8_t)a.i16[i]; + } + } + + SIMDE__VECTORIZE + for (size_t i = 0; i < (8 / sizeof(int16_t)); i++) { + if (b.i16[i] < INT8_MIN) { + r.i8[i + 4] = INT8_MIN; + } else if (b.i16[i] > INT8_MAX) { + r.i8[i + 4] = INT8_MAX; + } else { + r.i8[i + 4] = (int8_t)b.i16[i]; + } + } + + return r; +#endif +} +#define simde_m_packsswb(a, b) simde_mm_packs_pi16(a, b) + +SIMDE__FUNCTION_ATTRIBUTES +simde__m64 simde_mm_packs_pi32(simde__m64 a, simde__m64 b) +{ +#if defined(SIMDE_MMX_NATIVE) + return SIMDE__M64_C(_mm_packs_pi32(a.n, b.n)); +#else + simde__m64 r; + + SIMDE__VECTORIZE + for (size_t i = 0; i < (8 / sizeof(a.i32[0])); i++) { + if (a.i32[i] < SHRT_MIN) { + r.i16[i] = SHRT_MIN; + } else if (a.i32[i] > INT16_MAX) { + r.i16[i] = INT16_MAX; + } else { + r.i16[i] = (int16_t)a.i32[i]; + } + } + + SIMDE__VECTORIZE + for (size_t i = 0; i < (8 / sizeof(b.i32[0])); i++) { + if (b.i32[i] < SHRT_MIN) { + r.i16[i + 2] = SHRT_MIN; + } else if (b.i32[i] > INT16_MAX) { + r.i16[i + 2] = INT16_MAX; + } else { + r.i16[i + 2] = (int16_t)b.i32[i]; + } + } + + return r; +#endif +} +#define simde_m_packssdw(a, b) simde_mm_packs_pi32(a, b) + +SIMDE__FUNCTION_ATTRIBUTES +simde__m64 simde_mm_packs_pu16(simde__m64 a, simde__m64 b) +{ +#if defined(SIMDE_MMX_NATIVE) + return SIMDE__M64_C(_mm_packs_pu16(a.n, b.n)); +#else + simde__m64 r; + + SIMDE__VECTORIZE + for (size_t i = 0; i < (8 / sizeof(int16_t)); i++) { + if (a.i16[i] > UINT8_MAX) { + r.u8[i] = UINT8_MAX; + } else if (a.i16[i] < 0) { + r.u8[i] = 0; + } else { + r.u8[i] = (int8_t)a.i16[i]; + } + } + + SIMDE__VECTORIZE + for (size_t i = 0; i < (8 / sizeof(int16_t)); i++) { + if (b.i16[i] > UINT8_MAX) { + r.u8[i + 4] = UINT8_MAX; + } else if (b.i16[i] < 0) { + r.u8[i + 4] = 0; + } else { + r.u8[i + 4] = (int8_t)b.i16[i]; + } + } + + return r; +#endif +} +#define simde_m_packuswb(a, b) 
simde_mm_packs_pu16(a, b) + +SIMDE__FUNCTION_ATTRIBUTES +simde__m64 simde_mm_set_pi8(int8_t e7, int8_t e6, int8_t e5, int8_t e4, + int8_t e3, int8_t e2, int8_t e1, int8_t e0) +{ +#if defined(SIMDE_MMX_NATIVE) + return SIMDE__M64_C(_mm_set_pi8(e7, e6, e5, e4, e3, e2, e1, e0)); +#else + simde__m64 r; + r.i8[0] = e0; + r.i8[1] = e1; + r.i8[2] = e2; + r.i8[3] = e3; + r.i8[4] = e4; + r.i8[5] = e5; + r.i8[6] = e6; + r.i8[7] = e7; + return r; +#endif +} + +SIMDE__FUNCTION_ATTRIBUTES +simde__m64 simde_x_mm_set_pu8(uint8_t e7, uint8_t e6, uint8_t e5, uint8_t e4, + uint8_t e3, uint8_t e2, uint8_t e1, uint8_t e0) +{ +#if defined(SIMDE_MMX_NATIVE) + return SIMDE__M64_C(_mm_set_pi8((int8_t)e7, (int8_t)e6, (int8_t)e5, + (int8_t)e4, (int8_t)e3, (int8_t)e2, + (int8_t)e1, (int8_t)e0)); +#else + simde__m64 r; + r.u8[0] = e0; + r.u8[1] = e1; + r.u8[2] = e2; + r.u8[3] = e3; + r.u8[4] = e4; + r.u8[5] = e5; + r.u8[6] = e6; + r.u8[7] = e7; + return r; +#endif +} + +SIMDE__FUNCTION_ATTRIBUTES +simde__m64 simde_mm_set_pi16(int16_t e3, int16_t e2, int16_t e1, int16_t e0) +{ +#if defined(SIMDE_MMX_NATIVE) + return SIMDE__M64_C(_mm_set_pi16(e3, e2, e1, e0)); +#else + simde__m64 r; + r.i16[0] = e0; + r.i16[1] = e1; + r.i16[2] = e2; + r.i16[3] = e3; + return r; +#endif +} + +SIMDE__FUNCTION_ATTRIBUTES +simde__m64 simde_x_mm_set_pu16(uint16_t e3, uint16_t e2, uint16_t e1, + uint16_t e0) +{ +#if defined(SIMDE_MMX_NATIVE) + return SIMDE__M64_C(_mm_set_pi16((int16_t)e3, (int16_t)e2, (int16_t)e1, + (int16_t)e0)); +#else + simde__m64 r; + r.u16[0] = e0; + r.u16[1] = e1; + r.u16[2] = e2; + r.u16[3] = e3; + return r; +#endif +} + +SIMDE__FUNCTION_ATTRIBUTES +simde__m64 simde_x_mm_set_pu32(uint32_t e1, uint32_t e0) +{ +#if defined(SIMDE_MMX_NATIVE) + return SIMDE__M64_C(_mm_set_pi32((int32_t)e1, (int32_t)e0)); +#else + simde__m64 r; + r.u32[0] = e0; + r.u32[1] = e1; + return r; +#endif +} + +SIMDE__FUNCTION_ATTRIBUTES +simde__m64 simde_mm_set_pi32(int32_t e1, int32_t e0) +{ +#if defined(SIMDE_MMX_NATIVE) + return SIMDE__M64_C(_mm_set_pi32(e1, e0)); +#else + simde__m64 r; + r.i32[0] = e0; + r.i32[1] = e1; + return r; +#endif +} + +SIMDE__FUNCTION_ATTRIBUTES +simde__m64 simde_mm_set1_pi8(int8_t a) +{ +#if defined(SIMDE_MMX_NATIVE) + return SIMDE__M64_C(_mm_set1_pi8(a)); +#else + return simde_mm_set_pi8(a, a, a, a, a, a, a, a); +#endif +} + +SIMDE__FUNCTION_ATTRIBUTES +simde__m64 simde_mm_set1_pi16(int16_t a) +{ +#if defined(SIMDE_MMX_NATIVE) + return SIMDE__M64_C(_mm_set1_pi16(a)); +#else + return simde_mm_set_pi16(a, a, a, a); +#endif +} + +SIMDE__FUNCTION_ATTRIBUTES +simde__m64 simde_mm_set1_pi32(int32_t a) +{ +#if defined(SIMDE_MMX_NATIVE) + return SIMDE__M64_C(_mm_set1_pi32(a)); +#else + return simde_mm_set_pi32(a, a); +#endif +} + +SIMDE__FUNCTION_ATTRIBUTES +simde__m64 simde_mm_setr_pi8(int8_t e7, int8_t e6, int8_t e5, int8_t e4, + int8_t e3, int8_t e2, int8_t e1, int8_t e0) +{ +#if defined(SIMDE_MMX_NATIVE) + return SIMDE__M64_C(_mm_setr_pi8(e7, e6, e5, e4, e3, e2, e1, e0)); +#else + return simde_mm_set_pi8(e0, e1, e2, e3, e4, e5, e6, e7); +#endif +} + +SIMDE__FUNCTION_ATTRIBUTES +simde__m64 simde_mm_setr_pi16(int16_t e3, int16_t e2, int16_t e1, int16_t e0) +{ +#if defined(SIMDE_MMX_NATIVE) + return SIMDE__M64_C(_mm_setr_pi16(e3, e2, e1, e0)); +#else + return simde_mm_set_pi16(e0, e1, e2, e3); +#endif +} + +SIMDE__FUNCTION_ATTRIBUTES +simde__m64 simde_mm_setr_pi32(int32_t e1, int32_t e0) +{ +#if defined(SIMDE_MMX_NATIVE) + return SIMDE__M64_C(_mm_setr_pi32(e1, e0)); +#else + return simde_mm_set_pi32(e0, e1); +#endif +} + 
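The set/setr pairs above differ only in argument order: simde_mm_set_pi16 takes its elements most-significant-lane first and stores the last argument into lane 0, while simde_mm_setr_pi16 takes them in memory (lane) order and is implemented by reversing the arguments into the set variant. A minimal sketch of that equivalence, assuming hypothetical demo_* names rather than the patch's own functions:

#include <stdint.h>
#include <string.h>
#include <assert.h>

typedef union {
	int16_t i16[4];
} demo_m64;

/* Mirrors simde_mm_set_pi16: the last argument lands in lane 0. */
static demo_m64 demo_set_pi16(int16_t e3, int16_t e2, int16_t e1, int16_t e0)
{
	demo_m64 r = {{e0, e1, e2, e3}};
	return r;
}

/* Mirrors simde_mm_setr_pi16: same arguments, reversed into set. */
static demo_m64 demo_setr_pi16(int16_t e3, int16_t e2, int16_t e1, int16_t e0)
{
	return demo_set_pi16(e0, e1, e2, e3);
}

int main(void)
{
	demo_m64 a = demo_set_pi16(4, 3, 2, 1);  /* lanes: 1, 2, 3, 4 */
	demo_m64 b = demo_setr_pi16(1, 2, 3, 4); /* same lanes, memory order */
	assert(memcmp(&a, &b, sizeof(a)) == 0);
	return 0;
}

This is why simde_mm_setr_pi8/pi16/pi32 in the patch need no fallback loop of their own: they just forward to the corresponding set function with the arguments flipped.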
+SIMDE__FUNCTION_ATTRIBUTES +simde__m64 simde_mm_setzero_si64(void) +{ +#if defined(SIMDE_MMX_NATIVE) + return SIMDE__M64_C(_mm_setzero_si64()); +#else + return simde_mm_set_pi32(0, 0); +#endif +} + +SIMDE__FUNCTION_ATTRIBUTES +simde__m64 simde_mm_sll_pi16(simde__m64 a, simde__m64 count) +{ +#if defined(SIMDE_MMX_NATIVE) + return SIMDE__M64_C(_mm_sll_pi16(a.n, count.n)); +#else + simde__m64 r; + + if (HEDLEY_UNLIKELY(count.u64[0] > 15)) { + memset(&r, 0, sizeof(r)); + return r; + } + + SIMDE__VECTORIZE + for (size_t i = 0; i < (sizeof(r.u16) / sizeof(r.u16[0])); i++) { + r.u16[i] = a.u16[i] << count.u64[0]; + } + return r; +#endif +} +#define simde_m_psllw(a, count) simde_mm_sll_pi16(a, count) + +SIMDE__FUNCTION_ATTRIBUTES +simde__m64 simde_mm_sll_pi32(simde__m64 a, simde__m64 count) +{ +#if defined(SIMDE_MMX_NATIVE) + return SIMDE__M64_C(_mm_sll_pi32(a.n, count.n)); +#else + simde__m64 r; + + if (HEDLEY_UNLIKELY(count.u64[0] > 31)) { + memset(&r, 0, sizeof(r)); + return r; + } + + SIMDE__VECTORIZE + for (size_t i = 0; i < (sizeof(r.u32) / sizeof(r.u32[0])); i++) { + r.u32[i] = a.u32[i] << count.u64[0]; + } + return r; +#endif +} +#define simde_m_pslld(a, count) simde_mm_sll_pi32(a, count) + +SIMDE__FUNCTION_ATTRIBUTES +simde__m64 simde_mm_slli_pi16(simde__m64 a, int count) +{ +#if defined(SIMDE_MMX_NATIVE) && !defined(__PGI) + return SIMDE__M64_C(_mm_slli_pi16(a.n, count)); +#else + simde__m64 r; + + SIMDE__VECTORIZE + for (size_t i = 0; i < (sizeof(r.u16) / sizeof(r.u16[0])); i++) { + r.u16[i] = a.u16[i] << count; + } + + return r; +#endif +} +#define simde_m_psllwi(a, count) simde_mm_slli_pi16(a, count) + +SIMDE__FUNCTION_ATTRIBUTES +simde__m64 simde_mm_slli_pi32(simde__m64 a, int count) +{ +#if defined(SIMDE_MMX_NATIVE) && !defined(__PGI) + return SIMDE__M64_C(_mm_slli_pi32(a.n, count)); +#else + simde__m64 r; + + SIMDE__VECTORIZE + for (size_t i = 0; i < (8 / sizeof(int)); i++) { + r.u32[i] = a.u32[i] << count; + } + + return r; +#endif +} +#define simde_m_pslldi(a, b) simde_mm_slli_pi32(a, b) + +SIMDE__FUNCTION_ATTRIBUTES +simde__m64 simde_mm_slli_si64(simde__m64 a, int count) +{ +#if defined(SIMDE_MMX_NATIVE) + return SIMDE__M64_C(_mm_slli_si64(a.n, count)); +#else + simde__m64 r; + r.u64[0] = a.u64[0] << count; + return r; +#endif +} +#define simde_m_psllqi(a, count) simde_mm_slli_si64(a, count) + +SIMDE__FUNCTION_ATTRIBUTES +simde__m64 simde_mm_sll_si64(simde__m64 a, simde__m64 count) +{ +#if defined(SIMDE_MMX_NATIVE) + return SIMDE__M64_C(_mm_sll_si64(a.n, count.n)); +#else + simde__m64 r; + + if (HEDLEY_UNLIKELY(count.u64[0] > 63)) { + memset(&r, 0, sizeof(r)); + return r; + } + + r.u64[0] = a.u64[0] << count.u64[0]; + + return r; +#endif +} +#define simde_m_psllq(a, count) simde_mm_sll_si64(a, count) + +SIMDE__FUNCTION_ATTRIBUTES +simde__m64 simde_mm_srl_pi16(simde__m64 a, simde__m64 count) +{ +#if defined(SIMDE_MMX_NATIVE) + return SIMDE__M64_C(_mm_srl_pi16(a.n, count.n)); +#else + simde__m64 r; + + if (HEDLEY_UNLIKELY(count.u64[0] > 15)) { + memset(&r, 0, sizeof(r)); + return r; + } + + SIMDE__VECTORIZE + for (size_t i = 0; i < sizeof(r.u16) / sizeof(r.u16[0]); i++) { + r.u16[i] = a.u16[i] >> count.u64[0]; + } + return r; +#endif +} +#define simde_m_psrlw(a, count) simde_mm_srl_pi16(a, count) + +SIMDE__FUNCTION_ATTRIBUTES +simde__m64 simde_mm_srl_pi32(simde__m64 a, simde__m64 count) +{ +#if defined(SIMDE_MMX_NATIVE) + return SIMDE__M64_C(_mm_srl_pi32(a.n, count.n)); +#else + simde__m64 r; + + if (HEDLEY_UNLIKELY(count.u64[0] > 31)) { + memset(&r, 0, sizeof(r)); + return r; + 
} + + SIMDE__VECTORIZE + for (size_t i = 0; i < sizeof(r.u32) / sizeof(r.u32[0]); i++) { + r.u32[i] = a.u32[i] >> count.u64[0]; + } + return r; +#endif +} +#define simde_m_psrld(a, count) simde_mm_srl_pi32(a, count) + +SIMDE__FUNCTION_ATTRIBUTES +simde__m64 simde_mm_srli_pi16(simde__m64 a, int count) +{ +#if defined(SIMDE_MMX_NATIVE) && !defined(__PGI) + return SIMDE__M64_C(_mm_srli_pi16(a.n, count)); +#else + simde__m64 r; + + SIMDE__VECTORIZE + for (size_t i = 0; i < (8 / sizeof(uint16_t)); i++) { + r.u16[i] = a.u16[i] >> count; + } + + return r; +#endif +} +#define simde_m_psrlwi(a, count) simde_mm_srli_pi16(a, count) + +SIMDE__FUNCTION_ATTRIBUTES +simde__m64 simde_mm_srli_pi32(simde__m64 a, int count) +{ +#if defined(SIMDE_MMX_NATIVE) && !defined(__PGI) + return SIMDE__M64_C(_mm_srli_pi32(a.n, count)); +#else + simde__m64 r; + + SIMDE__VECTORIZE + for (size_t i = 0; i < (8 / sizeof(int)); i++) { + r.u32[i] = a.u32[i] >> count; + } + + return r; +#endif +} +#define simde_m_psrldi(a, count) simde_mm_srli_pi32(a, count) + +SIMDE__FUNCTION_ATTRIBUTES +simde__m64 simde_mm_srli_si64(simde__m64 a, int count) +{ +#if defined(SIMDE_MMX_NATIVE) && !defined(__PGI) + return SIMDE__M64_C(_mm_srli_si64(a.n, count)); +#else + simde__m64 r; + r.u64[0] = a.u64[0] >> count; + return r; +#endif +} +#define simde_m_psrlqi(a, count) simde_mm_srli_si64(a, count) + +SIMDE__FUNCTION_ATTRIBUTES +simde__m64 simde_mm_srl_si64(simde__m64 a, simde__m64 count) +{ +#if defined(SIMDE_MMX_NATIVE) + return SIMDE__M64_C(_mm_srl_si64(a.n, count.n)); +#else + simde__m64 r; + + if (HEDLEY_UNLIKELY(count.u64[0] > 63)) { + memset(&r, 0, sizeof(r)); + return r; + } + + r.u64[0] = a.u64[0] >> count.u64[0]; + return r; +#endif +} +#define simde_m_psrlq(a, count) simde_mm_srl_si64(a, count) + +SIMDE__FUNCTION_ATTRIBUTES +simde__m64 simde_mm_srai_pi16(simde__m64 a, int count) +{ +#if defined(SIMDE_MMX_NATIVE) && !defined(__PGI) + return SIMDE__M64_C(_mm_srai_pi16(a.n, count)); +#else + simde__m64 r; + + const uint16_t m = + (uint16_t)((~0U) << ((sizeof(int16_t) * CHAR_BIT) - count)); + + SIMDE__VECTORIZE + for (size_t i = 0; i < (8 / sizeof(int16_t)); i++) { + const uint16_t is_neg = ((uint16_t)( + ((a.u16[i]) >> ((sizeof(int16_t) * CHAR_BIT) - 1)))); + r.u16[i] = (a.u16[i] >> count) | (m * is_neg); + } + + return r; +#endif +} +#define simde_m_psrawi(a, count) simde_mm_srai_pi16(a, count) + +SIMDE__FUNCTION_ATTRIBUTES +simde__m64 simde_mm_srai_pi32(simde__m64 a, int count) +{ +#if defined(SIMDE_MMX_NATIVE) && !defined(__PGI) + return SIMDE__M64_C(_mm_srai_pi32(a.n, count)); +#else + simde__m64 r; + + const uint32_t m = + (uint32_t)((~0U) << ((sizeof(int) * CHAR_BIT) - count)); + SIMDE__VECTORIZE + for (size_t i = 0; i < (8 / sizeof(int)); i++) { + const uint32_t is_neg = ((uint32_t)( + ((a.u32[i]) >> ((sizeof(int) * CHAR_BIT) - 1)))); + r.u32[i] = (a.u32[i] >> count) | (m * is_neg); + } + + return r; +#endif +} +#define simde_m_srai_pi32(a, count) simde_mm_srai_pi32(a, count) + +SIMDE__FUNCTION_ATTRIBUTES +simde__m64 simde_mm_sra_pi16(simde__m64 a, simde__m64 count) +{ +#if defined(SIMDE_MMX_NATIVE) + return SIMDE__M64_C(_mm_sra_pi16(a.n, count.n)); +#else + simde__m64 r; + int cnt = (int)count.i64[0]; + + if (cnt > 15 || cnt < 0) { + for (size_t i = 0; i < (sizeof(r.i16) / sizeof(r.i16[0])); + i++) { + r.u16[i] = (a.i16[i] < 0) ? 
0xffff : 0x0000; + } + } else { + const uint16_t m = (uint16_t)( + (~0U) << ((sizeof(int16_t) * CHAR_BIT) - cnt)); + for (size_t i = 0; i < (sizeof(r.i16) / sizeof(r.i16[0])); + i++) { + const uint16_t is_neg = a.i16[i] < 0; + r.u16[i] = (a.u16[i] >> cnt) | (m * is_neg); + } + } + + return r; +#endif +} +#define simde_m_psraw(a, count) simde_mm_sra_pi16(a, count) + +SIMDE__FUNCTION_ATTRIBUTES +simde__m64 simde_mm_sra_pi32(simde__m64 a, simde__m64 count) +{ +#if defined(SIMDE_MMX_NATIVE) + return SIMDE__M64_C(_mm_sra_pi32(a.n, count.n)); +#else + simde__m64 r; + const uint64_t cnt = count.u64[0]; + + if (cnt > 31) { + for (size_t i = 0; i < (sizeof(r.i32) / sizeof(r.i32[0])); + i++) { + r.u32[i] = (a.i32[i] < 0) ? UINT32_MAX : 0; + } + } else if (cnt == 0) { + memcpy(&r, &a, sizeof(r)); + } else { + const uint32_t m = (uint32_t)( + (~0U) << ((sizeof(int32_t) * CHAR_BIT) - cnt)); + for (size_t i = 0; i < (sizeof(r.i32) / sizeof(r.i32[0])); + i++) { + const uint32_t is_neg = a.i32[i] < 0; + r.u32[i] = (a.u32[i] >> cnt) | (m * is_neg); + } + } + + return r; +#endif +} +#define simde_m_psrad(a, b) simde_mm_sra_pi32(a, b) + +SIMDE__FUNCTION_ATTRIBUTES +simde__m64 simde_mm_sub_pi8(simde__m64 a, simde__m64 b) +{ +#if defined(SIMDE_MMX_NATIVE) + return SIMDE__M64_C(_mm_sub_pi8(a.n, b.n)); +#else + simde__m64 r; + SIMDE__VECTORIZE + for (size_t i = 0; i < 8; i++) { + r.i8[i] = a.i8[i] - b.i8[i]; + } + return r; +#endif +} +#define simde_m_psubb(a, b) simde_mm_sub_pi8(a, b) + +SIMDE__FUNCTION_ATTRIBUTES +simde__m64 simde_mm_sub_pi16(simde__m64 a, simde__m64 b) +{ +#if defined(SIMDE_MMX_NATIVE) + return SIMDE__M64_C(_mm_sub_pi16(a.n, b.n)); +#else + simde__m64 r; + SIMDE__VECTORIZE + for (size_t i = 0; i < (8 / sizeof(int16_t)); i++) { + r.i16[i] = a.i16[i] - b.i16[i]; + } + return r; +#endif +} +#define simde_m_psubw(a, b) simde_mm_sub_pi16(a, b) + +SIMDE__FUNCTION_ATTRIBUTES +simde__m64 simde_mm_sub_pi32(simde__m64 a, simde__m64 b) +{ +#if defined(SIMDE_MMX_NATIVE) + return SIMDE__M64_C(_mm_sub_pi32(a.n, b.n)); +#else + simde__m64 r; + SIMDE__VECTORIZE + for (size_t i = 0; i < (8 / sizeof(int)); i++) { + r.i32[i] = a.i32[i] - b.i32[i]; + } + return r; +#endif +} +#define simde_m_psubd(a, b) simde_mm_sub_pi32(a, b) + +SIMDE__FUNCTION_ATTRIBUTES +simde__m64 simde_mm_subs_pi8(simde__m64 a, simde__m64 b) +{ +#if defined(SIMDE_MMX_NATIVE) + return SIMDE__M64_C(_mm_subs_pi8(a.n, b.n)); +#else + simde__m64 r; + SIMDE__VECTORIZE + for (size_t i = 0; i < (8); i++) { + if (((b.i8[i]) > 0 && (a.i8[i]) < INT8_MIN + (b.i8[i]))) { + r.i8[i] = INT8_MIN; + } else if ((b.i8[i]) < 0 && (a.i8[i]) > INT8_MAX + (b.i8[i])) { + r.i8[i] = INT8_MAX; + } else { + r.i8[i] = (a.i8[i]) - (b.i8[i]); + } + } + return r; +#endif +} +#define simde_m_psubsb(a, b) simde_mm_subs_pi8(a, b) + +SIMDE__FUNCTION_ATTRIBUTES +simde__m64 simde_mm_subs_pu8(simde__m64 a, simde__m64 b) +{ +#if defined(SIMDE_MMX_NATIVE) + return SIMDE__M64_C(_mm_subs_pu8(a.n, b.n)); +#else + simde__m64 r; + SIMDE__VECTORIZE + for (size_t i = 0; i < (8); i++) { + const int32_t x = a.u8[i] - b.u8[i]; + if (x < 0) { + r.u8[i] = 0; + } else if (x > UINT8_MAX) { + r.u8[i] = UINT8_MAX; + } else { + r.u8[i] = (uint8_t)x; + } + } + return r; +#endif +} +#define simde_m_psubusb(a, b) simde_mm_subs_pu8(a, b) + +SIMDE__FUNCTION_ATTRIBUTES +simde__m64 simde_mm_subs_pi16(simde__m64 a, simde__m64 b) +{ +#if defined(SIMDE_MMX_NATIVE) + return SIMDE__M64_C(_mm_subs_pi16(a.n, b.n)); +#else + simde__m64 r; + SIMDE__VECTORIZE + for (size_t i = 0; i < (8 / sizeof(int16_t)); i++) { + 
if (((b.i16[i]) > 0 && (a.i16[i]) < SHRT_MIN + (b.i16[i]))) { + r.i16[i] = SHRT_MIN; + } else if ((b.i16[i]) < 0 && + (a.i16[i]) > INT16_MAX + (b.i16[i])) { + r.i16[i] = INT16_MAX; + } else { + r.i16[i] = (a.i16[i]) - (b.i16[i]); + } + } + return r; +#endif +} +#define simde_m_psubsw(a, b) simde_mm_subs_pi16(a, b) + +SIMDE__FUNCTION_ATTRIBUTES +simde__m64 simde_mm_subs_pu16(simde__m64 a, simde__m64 b) +{ +#if defined(SIMDE_MMX_NATIVE) + return SIMDE__M64_C(_mm_subs_pu16(a.n, b.n)); +#else + simde__m64 r; + SIMDE__VECTORIZE + for (size_t i = 0; i < (8 / sizeof(uint16_t)); i++) { + const int x = a.u16[i] - b.u16[i]; + if (x < 0) { + r.u16[i] = 0; + } else if (x > UINT16_MAX) { + r.u16[i] = UINT16_MAX; + } else { + r.u16[i] = (uint16_t)x; + } + } + return r; +#endif +} +#define simde_m_psubusw(a, b) simde_mm_subs_pu16(a, b) + +SIMDE__FUNCTION_ATTRIBUTES +simde__m64 simde_mm_unpackhi_pi8(simde__m64 a, simde__m64 b) +{ +#if defined(SIMDE_MMX_NATIVE) + return SIMDE__M64_C(_mm_unpackhi_pi8(a.n, b.n)); +#else + simde__m64 r; + r.i8[0] = a.i8[4]; + r.i8[1] = b.i8[4]; + r.i8[2] = a.i8[5]; + r.i8[3] = b.i8[5]; + r.i8[4] = a.i8[6]; + r.i8[5] = b.i8[6]; + r.i8[6] = a.i8[7]; + r.i8[7] = b.i8[7]; + return r; +#endif +} +#define simde_m_punpckhbw(a, b) simde_mm_unpackhi_pi8(a, b) + +SIMDE__FUNCTION_ATTRIBUTES +simde__m64 simde_mm_unpackhi_pi16(simde__m64 a, simde__m64 b) +{ +#if defined(SIMDE_MMX_NATIVE) + return SIMDE__M64_C(_mm_unpackhi_pi16(a.n, b.n)); +#else + simde__m64 r; + r.i16[0] = a.i16[2]; + r.i16[1] = b.i16[2]; + r.i16[2] = a.i16[3]; + r.i16[3] = b.i16[3]; + return r; +#endif +} +#define simde_m_punpckhwd(a, b) simde_mm_unpackhi_pi16(a, b) + +SIMDE__FUNCTION_ATTRIBUTES +simde__m64 simde_mm_unpackhi_pi32(simde__m64 a, simde__m64 b) +{ +#if defined(SIMDE_MMX_NATIVE) + return SIMDE__M64_C(_mm_unpackhi_pi32(a.n, b.n)); +#else + simde__m64 r; + r.i32[0] = a.i32[1]; + r.i32[1] = b.i32[1]; + return r; +#endif +} +#define simde_m_punpckhdq(a, b) simde_mm_unpackhi_pi32(a, b) + +SIMDE__FUNCTION_ATTRIBUTES +simde__m64 simde_mm_unpacklo_pi8(simde__m64 a, simde__m64 b) +{ +#if defined(SIMDE_MMX_NATIVE) + return SIMDE__M64_C(_mm_unpacklo_pi8(a.n, b.n)); +#else + simde__m64 r; + r.i8[0] = a.i8[0]; + r.i8[1] = b.i8[0]; + r.i8[2] = a.i8[1]; + r.i8[3] = b.i8[1]; + r.i8[4] = a.i8[2]; + r.i8[5] = b.i8[2]; + r.i8[6] = a.i8[3]; + r.i8[7] = b.i8[3]; + return r; +#endif +} +#define simde_m_punpcklbw(a, b) simde_mm_unpacklo_pi8(a, b) + +SIMDE__FUNCTION_ATTRIBUTES +simde__m64 simde_mm_unpacklo_pi16(simde__m64 a, simde__m64 b) +{ +#if defined(SIMDE_MMX_NATIVE) + return SIMDE__M64_C(_mm_unpacklo_pi16(a.n, b.n)); +#else + simde__m64 r; + r.i16[0] = a.i16[0]; + r.i16[1] = b.i16[0]; + r.i16[2] = a.i16[1]; + r.i16[3] = b.i16[1]; + return r; +#endif +} +#define simde_m_punpcklwd(a, b) simde_mm_unpacklo_pi16(a, b) + +SIMDE__FUNCTION_ATTRIBUTES +simde__m64 simde_mm_unpacklo_pi32(simde__m64 a, simde__m64 b) +{ +#if defined(SIMDE_MMX_NATIVE) + return SIMDE__M64_C(_mm_unpacklo_pi32(a.n, b.n)); +#else + simde__m64 r; + r.i32[0] = a.i32[0]; + r.i32[1] = b.i32[0]; + return r; +#endif +} +#define simde_m_punpckldq(a, b) simde_mm_unpacklo_pi32(a, b) + +SIMDE__FUNCTION_ATTRIBUTES +simde__m64 simde_mm_xor_si64(simde__m64 a, simde__m64 b) +{ +#if defined(SIMDE_MMX_NATIVE) + return SIMDE__M64_C(_mm_xor_si64(a.n, b.n)); +#else + simde__m64 r; + r.i64[0] = a.i64[0] ^ b.i64[0]; + return r; +#endif +} +#define simde_m_pxor(a, b) simde_mm_xor_si64(a, b) + +SIMDE__FUNCTION_ATTRIBUTES +int32_t simde_m_to_int(simde__m64 a) +{ +#if 
defined(SIMDE_MMX_NATIVE) + return _m_to_int(a.n); +#else + return a.i32[0]; +#endif +} + +SIMDE__END_DECLS + +#endif /* !defined(SIMDE__MMX_H) */ diff --git a/libobs/util/aarch/simde-arch.h b/libobs/util/aarch/simde-arch.h new file mode 100644 index 000000000..532304cda --- /dev/null +++ b/libobs/util/aarch/simde-arch.h @@ -0,0 +1,355 @@ +/* Architecture detection + * Created by Evan Nemerson + * + * To the extent possible under law, the authors have waived all + * copyright and related or neighboring rights to this code. For + * details, see the Creative Commons Zero 1.0 Universal license at + * + * + * Different compilers define different preprocessor macros for the + * same architecture. This is an attempt to provide a single + * interface which is usable on any compiler. + * + * In general, a macro named SIMDE_ARCH_* is defined for each + * architecture the CPU supports. When there are multiple possible + * versions, we try to define the macro to the target version. For + * example, if you want to check for i586+, you could do something + * like: + * + * #if defined(SIMDE_ARCH_X86) && (SIMDE_ARCH_X86 >= 5) + * ... + * #endif + * + * You could also just check that SIMDE_ARCH_X86 >= 5 without checking + * if it's defined first, but some compilers may emit a warning about + * an undefined macro being used (e.g., GCC with -Wundef). + * + * This was originally created for SIMDe + * (hence the prefix), but this + * header has no dependencies and may be used anywhere. It is + * originally based on information from + * , though it + * has been enhanced with additional information. + * + * If you improve this file, or find a bug, please file the issue at + * . If you copy this into + * your project, even if you change the prefix, please keep the links + * to SIMDe intact so others know where to report issues, submit + * enhancements, and find the latest version. 
*/ + +#if !defined(SIMDE_ARCH_H) +#define SIMDE_ARCH_H + +/* Alpha + */ +#if defined(__alpha__) || defined(__alpha) || defined(_M_ALPHA) +#if defined(__alpha_ev6__) +#define SIMDE_ARCH_ALPHA 6 +#elif defined(__alpha_ev5__) +#define SIMDE_ARCH_ALPHA 5 +#elif defined(__alpha_ev4__) +#define SIMDE_ARCH_ALPHA 4 +#else +#define SIMDE_ARCH_ALPHA 1 +#endif +#endif + +/* Atmel AVR + */ +#if defined(__AVR_ARCH__) +#define SIMDE_ARCH_AVR __AVR_ARCH__ +#endif + +/* AMD64 / x86_64 + */ +#if defined(__amd64__) || defined(__amd64) || defined(__x86_64__) || \ + defined(__x86_64) || defined(_M_X66) || defined(_M_AMD64) +#define SIMDE_ARCH_AMD64 1 +#endif + +/* ARM + */ +#if defined(__ARM_ARCH_8A__) +#define SIMDE_ARCH_ARM 82 +#elif defined(__ARM_ARCH_8R__) +#define SIMDE_ARCH_ARM 81 +#elif defined(__ARM_ARCH_8__) +#define SIMDE_ARCH_ARM 80 +#elif defined(__ARM_ARCH_7S__) +#define SIMDE_ARCH_ARM 74 +#elif defined(__ARM_ARCH_7M__) +#define SIMDE_ARCH_ARM 73 +#elif defined(__ARM_ARCH_7R__) +#define SIMDE_ARCH_ARM 72 +#elif defined(__ARM_ARCH_7A__) +#define SIMDE_ARCH_ARM 71 +#elif defined(__ARM_ARCH_7__) +#define SIMDE_ARCH_ARM 70 +#elif defined(__ARM_ARCH) +#define SIMDE_ARCH_ARM (__ARM_ARCH * 10) +#elif defined(_M_ARM) +#define SIMDE_ARCH_ARM (_M_ARM * 10) +#elif defined(__arm__) || defined(__thumb__) || defined(__TARGET_ARCH_ARM) || \ + defined(_ARM) || defined(_M_ARM) || defined(_M_ARM) +#define SIMDE_ARCH_ARM 1 +#endif + +/* AArch64 + */ +#if defined(__aarch64__) || defined(_M_ARM64) +#define SIMDE_ARCH_AARCH64 10 +#endif + +/* Blackfin + */ +#if defined(__bfin) || defined(__BFIN__) || defined(__bfin__) +#define SIMDE_ARCH_BLACKFIN 1 +#endif + +/* CRIS + */ +#if defined(__CRIS_arch_version) +#define SIMDE_ARCH_CRIS __CRIS_arch_version +#elif defined(__cris__) || defined(__cris) || defined(__CRIS) || \ + defined(__CRIS__) +#define SIMDE_ARCH_CRIS 1 +#endif + +/* Convex + */ +#if defined(__convex_c38__) +#define SIMDE_ARCH_CONVEX 38 +#elif defined(__convex_c34__) +#define SIMDE_ARCH_CONVEX 34 +#elif defined(__convex_c32__) +#define SIMDE_ARCH_CONVEX 32 +#elif defined(__convex_c2__) +#define SIMDE_ARCH_CONVEX 2 +#elif defined(__convex__) +#define SIMDE_ARCH_CONVEX 1 +#endif + +/* Adapteva Epiphany + */ +#if defined(__epiphany__) +#define SIMDE_ARCH_EPIPHANY 1 +#endif + +/* Fujitsu FR-V + */ +#if defined(__frv__) +#define SIMDE_ARCH_FRV 1 +#endif + +/* H8/300 + */ +#if defined(__H8300__) +#define SIMDE_ARCH_H8300 +#endif + +/* HP/PA / PA-RISC + */ +#if defined(__PA8000__) || defined(__HPPA20__) || defined(__RISC2_0__) || \ + defined(_PA_RISC2_0) +#define SIMDE_ARCH_HPPA 20 +#elif defined(__PA7100__) || defined(__HPPA11__) || defined(_PA_RISC1_1) +#define SIMDE_ARCH_HPPA 11 +#elif defined(_PA_RISC1_0) +#define SIMDE_ARCH_HPPA 10 +#elif defined(__hppa__) || defined(__HPPA__) || defined(__hppa) +#define SIMDE_ARCH_HPPA 1 +#endif + +/* x86 + */ +#if defined(_M_IX86) +#define SIMDE_ARCH_X86 (_M_IX86 / 100) +#elif defined(__I86__) +#define SIMDE_ARCH_X86 __I86__ +#elif defined(i686) || defined(__i686) || defined(__i686__) +#define SIMDE_ARCH_X86 6 +#elif defined(i586) || defined(__i586) || defined(__i586__) +#define SIMDE_ARCH_X86 5 +#elif defined(i486) || defined(__i486) || defined(__i486__) +#define SIMDE_ARCH_X86 4 +#elif defined(i386) || defined(__i386) || defined(__i386__) +#define SIMDE_ARCH_X86 3 +#elif defined(_X86_) || defined(__X86__) || defined(__THW_INTEL__) +#define SIMDE_ARCH_X86 3 +#endif + +/* Itanium + */ +#if defined(__ia64__) || defined(_IA64) || defined(__IA64__) || \ + defined(__ia64) || 
defined(_M_IA64) || defined(__itanium__) +#define SIMDE_ARCH_IA64 1 +#endif + +/* Renesas M32R + */ +#if defined(__m32r__) || defined(__M32R__) +#define SIMDE_ARCH_M32R +#endif + +/* Motorola 68000 + */ +#if defined(__mc68060__) || defined(__MC68060__) +#define SIMDE_ARCH_M68K 68060 +#elif defined(__mc68040__) || defined(__MC68040__) +#define SIMDE_ARCH_M68K 68040 +#elif defined(__mc68030__) || defined(__MC68030__) +#define SIMDE_ARCH_M68K 68030 +#elif defined(__mc68020__) || defined(__MC68020__) +#define SIMDE_ARCH_M68K 68020 +#elif defined(__mc68010__) || defined(__MC68010__) +#define SIMDE_ARCH_M68K 68010 +#elif defined(__mc68000__) || defined(__MC68000__) +#define SIMDE_ARCH_M68K 68000 +#endif + +/* Xilinx MicroBlaze + */ +#if defined(__MICROBLAZE__) || defined(__microblaze__) +#define SIMDE_ARCH_MICROBLAZE +#endif + +/* MIPS + */ +#if defined(_MIPS_ISA_MIPS64R2) +#define SIMDE_ARCH_MIPS 642 +#elif defined(_MIPS_ISA_MIPS64) +#define SIMDE_ARCH_MIPS 640 +#elif defined(_MIPS_ISA_MIPS32R2) +#define SIMDE_ARCH_MIPS 322 +#elif defined(_MIPS_ISA_MIPS32) +#define SIMDE_ARCH_MIPS 320 +#elif defined(_MIPS_ISA_MIPS4) +#define SIMDE_ARCH_MIPS 4 +#elif defined(_MIPS_ISA_MIPS3) +#define SIMDE_ARCH_MIPS 3 +#elif defined(_MIPS_ISA_MIPS2) +#define SIMDE_ARCH_MIPS 2 +#elif defined(_MIPS_ISA_MIPS1) +#define SIMDE_ARCH_MIPS 1 +#elif defined(_MIPS_ISA_MIPS) || defined(__mips) || defined(__MIPS__) +#define SIMDE_ARCH_MIPS 1 +#endif + +/* Matsushita MN10300 + */ +#if defined(__MN10300__) || defined(__mn10300__) +#define SIMDE_ARCH_MN10300 1 +#endif + +/* POWER + */ +#if defined(_M_PPC) +#define SIMDE_ARCH_POWER _M_PPC +#elif defined(_ARCH_PWR8) +#define SIMDE_ARCH_POWER 800 +#elif defined(_ARCH_PWR7) +#define SIMDE_ARCH_POWER 700 +#elif defined(_ARCH_PWR6) +#define SIMDE_ARCH_POWER 600 +#elif defined(_ARCH_PWR5) +#define SIMDE_ARCH_POWER 500 +#elif defined(_ARCH_PWR4) +#define SIMDE_ARCH_POWER 400 +#elif defined(_ARCH_440) || defined(__ppc440__) +#define SIMDE_ARCH_POWER 440 +#elif defined(_ARCH_450) || defined(__ppc450__) +#define SIMDE_ARCH_POWER 450 +#elif defined(_ARCH_601) || defined(__ppc601__) +#define SIMDE_ARCH_POWER 601 +#elif defined(_ARCH_603) || defined(__ppc603__) +#define SIMDE_ARCH_POWER 603 +#elif defined(_ARCH_604) || defined(__ppc604__) +#define SIMDE_ARCH_POWER 604 +#elif defined(_ARCH_605) || defined(__ppc605__) +#define SIMDE_ARCH_POWER 605 +#elif defined(_ARCH_620) || defined(__ppc620__) +#define SIMDE_ARCH_POWER 620 +#elif defined(__powerpc) || defined(__powerpc__) || defined(__POWERPC__) || \ + defined(__ppc__) || defined(__PPC__) || defined(_ARCH_PPC) || \ + defined(__ppc) +#define SIMDE_ARCH_POWER 1 +#endif + +/* SPARC + */ +#if defined(__sparc_v9__) || defined(__sparcv9) +#define SIMDE_ARCH_SPARC 9 +#elif defined(__sparc_v8__) || defined(__sparcv8) +#define SIMDE_ARCH_SPARC 8 +#elif defined(__sparc_v7__) || defined(__sparcv7) +#define SIMDE_ARCH_SPARC 7 +#elif defined(__sparc_v6__) || defined(__sparcv6) +#define SIMDE_ARCH_SPARC 6 +#elif defined(__sparc_v5__) || defined(__sparcv5) +#define SIMDE_ARCH_SPARC 5 +#elif defined(__sparc_v4__) || defined(__sparcv4) +#define SIMDE_ARCH_SPARC 4 +#elif defined(__sparc_v3__) || defined(__sparcv3) +#define SIMDE_ARCH_SPARC 3 +#elif defined(__sparc_v2__) || defined(__sparcv2) +#define SIMDE_ARCH_SPARC 2 +#elif defined(__sparc_v1__) || defined(__sparcv1) +#define SIMDE_ARCH_SPARC 1 +#elif defined(__sparc__) || defined(__sparc) +#define SIMDE_ARCH_SPARC 1 +#endif + +/* SuperH + */ +#if defined(__sh5__) || defined(__SH5__) +#define 
SIMDE_ARCH_SUPERH 5 +#elif defined(__sh4__) || defined(__SH4__) +#define SIMDE_ARCH_SUPERH 4 +#elif defined(__sh3__) || defined(__SH3__) +#define SIMDE_ARCH_SUPERH 3 +#elif defined(__sh2__) || defined(__SH2__) +#define SIMDE_ARCH_SUPERH 2 +#elif defined(__sh1__) || defined(__SH1__) +#define SIMDE_ARCH_SUPERH 1 +#elif defined(__sh__) || defined(__SH__) +#define SIMDE_ARCH_SUPERH 1 +#endif + +/* IBM System z + */ +#if defined(__370__) || defined(__THW_370__) || defined(__s390__) || \ + defined(__s390x__) || defined(__zarch__) || defined(__SYSC_ZARCH__) +#define SIMDE_ARCH_SYSTEMZ +#endif + +/* TMS320 DSP + */ +#if defined(_TMS320C6740) || defined(__TMS320C6740__) +#define SIMDE_ARCH_TMS320 6740 +#elif defined(_TMS320C6700_PLUS) || defined(__TMS320C6700_PLUS__) +#define SIMDE_ARCH_TMS320 6701 +#elif defined(_TMS320C6700) || defined(__TMS320C6700__) +#define SIMDE_ARCH_TMS320 6700 +#elif defined(_TMS320C6600) || defined(__TMS320C6600__) +#define SIMDE_ARCH_TMS320 6600 +#elif defined(_TMS320C6400_PLUS) || defined(__TMS320C6400_PLUS__) +#define SIMDE_ARCH_TMS320 6401 +#elif defined(_TMS320C6400) || defined(__TMS320C6400__) +#define SIMDE_ARCH_TMS320 6400 +#elif defined(_TMS320C6200) || defined(__TMS320C6200__) +#define SIMDE_ARCH_TMS320 6200 +#elif defined(_TMS320C55X) || defined(__TMS320C55X__) +#define SIMDE_ARCH_TMS320 550 +#elif defined(_TMS320C54X) || defined(__TMS320C54X__) +#define SIMDE_ARCH_TMS320 540 +#elif defined(_TMS320C28X) || defined(__TMS320C28X__) +#define SIMDE_ARCH_TMS320 280 +#endif + +/* Xtensa + */ +#if defined(__xtensa__) || defined(__XTENSA__) +#define SIMDE_ARCH_XTENSA 1 +#endif + +#endif /* !defined(SIMDE_ARCH_H) */ diff --git a/libobs/util/aarch/simde-common.h b/libobs/util/aarch/simde-common.h new file mode 100644 index 000000000..7279d54ac --- /dev/null +++ b/libobs/util/aarch/simde-common.h @@ -0,0 +1,278 @@ +/* Copyright (c) 2017-2019 Evan Nemerson + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, + * modify, merge, publish, distribute, sublicense, and/or sell copies + * of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#if !defined(SIMDE_COMMON_H) +#define SIMDE_COMMON_H + +#include "hedley.h" +#include "check.h" +#include "simde-arch.h" + +#if (defined(__STDC_VERSION__) && __STDC_VERSION__ >= 201112L) +#define SIMDE_ALIGN(alignment) _Alignas(alignment) +#elif (defined(__cplusplus) && (__cplusplus >= 201103L)) +#define SIMDE_ALIGN(alignment) alignas(alignment) +#elif HEDLEY_GCC_VERSION_CHECK(2, 95, 0) || \ + HEDLEY_CRAY_VERSION_CHECK(8, 4, 0) || \ + HEDLEY_IBM_VERSION_CHECK(11, 1, 0) || \ + HEDLEY_INTEL_VERSION_CHECK(13, 0, 0) || \ + HEDLEY_PGI_VERSION_CHECK(19, 4, 0) || \ + HEDLEY_ARM_VERSION_CHECK(4, 1, 0) || \ + HEDLEY_TINYC_VERSION_CHECK(0, 9, 24) || \ + HEDLEY_TI_VERSION_CHECK(8, 1, 0) +#define SIMDE_ALIGN(alignment) __attribute__((aligned(alignment))) +#elif defined(_MSC_VER) && (!defined(_M_IX86) || defined(_M_AMD64)) +#define SIMDE_ALIGN(alignment) __declspec(align(alignment)) +#else +#define SIMDE_ALIGN(alignment) +#endif + +#define simde_assert_aligned(alignment, val) \ + simde_assert_int(((uintptr_t)(val)) % (alignment), ==, 0) + +#if HEDLEY_GCC_HAS_ATTRIBUTE(vector_size, 4, 6, 0) +#define SIMDE__ENABLE_GCC_VEC_EXT +#endif + +#if !defined(SIMDE_ENABLE_OPENMP) && \ + ((defined(_OPENMP) && (_OPENMP >= 201307L)) || \ + (defined(_OPENMP_SIMD) && (_OPENMP_SIMD >= 201307L))) +#define SIMDE_ENABLE_OPENMP +#endif + +#if !defined(SIMDE_ENABLE_CILKPLUS) && defined(__cilk) +#define SIMDE_ENABLE_CILKPLUS +#endif + +#if defined(SIMDE_ENABLE_OPENMP) +#define SIMDE__VECTORIZE _Pragma("omp simd") +#define SIMDE__VECTORIZE_SAFELEN(l) HEDLEY_PRAGMA(omp simd safelen(l)) +#define SIMDE__VECTORIZE_REDUCTION(r) HEDLEY_PRAGMA(omp simd reduction(r)) +#define SIMDE__VECTORIZE_ALIGNED(a) HEDLEY_PRAGMA(omp simd aligned(a)) +#elif defined(SIMDE_ENABLE_CILKPLUS) +#define SIMDE__VECTORIZE _Pragma("simd") +#define SIMDE__VECTORIZE_SAFELEN(l) HEDLEY_PRAGMA(simd vectorlength(l)) +#define SIMDE__VECTORIZE_REDUCTION(r) HEDLEY_PRAGMA(simd reduction(r)) +#define SIMDE__VECTORIZE_ALIGNED(a) HEDLEY_PRAGMA(simd aligned(a)) +#elif defined(__INTEL_COMPILER) +#define SIMDE__VECTORIZE _Pragma("simd") +#define SIMDE__VECTORIZE_SAFELEN(l) HEDLEY_PRAGMA(simd vectorlength(l)) +#define SIMDE__VECTORIZE_REDUCTION(r) HEDLEY_PRAGMA(simd reduction(r)) +#define SIMDE__VECTORIZE_ALIGNED(a) +#elif defined(__clang__) +#define SIMDE__VECTORIZE _Pragma("clang loop vectorize(enable)") +#define SIMDE__VECTORIZE_SAFELEN(l) HEDLEY_PRAGMA(clang loop vectorize_width(l)) +#define SIMDE__VECTORIZE_REDUCTION(r) SIMDE__VECTORIZE +#define SIMDE__VECTORIZE_ALIGNED(a) +#elif HEDLEY_GCC_VERSION_CHECK(4, 9, 0) +#define SIMDE__VECTORIZE _Pragma("GCC ivdep") +#define SIMDE__VECTORIZE_SAFELEN(l) SIMDE__VECTORIZE +#define SIMDE__VECTORIZE_REDUCTION(r) SIMDE__VECTORIZE +#define SIMDE__VECTORIZE_ALIGNED(a) +#elif HEDLEY_CRAY_VERSION_CHECK(5, 0, 0) +#define SIMDE__VECTORIZE _Pragma("_CRI ivdep") +#define SIMDE__VECTORIZE_SAFELEN(l) SIMDE__VECTORIZE +#define SIMDE__VECTORIZE_REDUCTION(r) SIMDE__VECTORIZE +#define SIMDE__VECTORIZE_ALIGNED(a) +#else +#define SIMDE__VECTORIZE +#define SIMDE__VECTORIZE_SAFELEN(l) +#define SIMDE__VECTORIZE_REDUCTION(r) +#define SIMDE__VECTORIZE_ALIGNED(a) +#endif + +#if HEDLEY_GCC_HAS_ATTRIBUTE(unused, 3, 1, 0) +#define SIMDE__UNUSED __attribute__((__unused__)) +#else +#define SIMDE__UNUSED +#endif + +#if HEDLEY_GCC_HAS_ATTRIBUTE(artificial, 4, 3, 0) +#define SIMDE__ARTIFICIAL __attribute__((__artificial__)) +#else +#define SIMDE__ARTIFICIAL +#endif + +/* Intended for checking coverage, you should never use this in + production. 
*/ +#if defined(SIMDE_NO_INLINE) +#define SIMDE__FUNCTION_ATTRIBUTES HEDLEY_NEVER_INLINE SIMDE__UNUSED static +#else +#define SIMDE__FUNCTION_ATTRIBUTES HEDLEY_INLINE SIMDE__ARTIFICIAL static +#endif + +#if defined(_MSC_VER) +#define SIMDE__BEGIN_DECLS \ + HEDLEY_DIAGNOSTIC_PUSH __pragma(warning(disable : 4996 4204)) \ + HEDLEY_BEGIN_C_DECLS +#define SIMDE__END_DECLS HEDLEY_DIAGNOSTIC_POP HEDLEY_END_C_DECLS +#else +#define SIMDE__BEGIN_DECLS HEDLEY_BEGIN_C_DECLS +#define SIMDE__END_DECLS HEDLEY_END_C_DECLS +#endif + +#if defined(__SIZEOF_INT128__) +#define SIMDE__HAVE_INT128 +typedef __int128 simde_int128; +typedef unsigned __int128 simde_uint128; +#endif + +/* TODO: we should at least make an attempt to detect the correct + types for simde_float32/float64 instead of just assuming float and + double. */ + +#if !defined(SIMDE_FLOAT32_TYPE) +#define SIMDE_FLOAT32_TYPE float +#define SIMDE_FLOAT32_C(value) value##f +#else +#define SIMDE_FLOAT32_C(value) ((SIMDE_FLOAT32_TYPE)value) +#endif +typedef SIMDE_FLOAT32_TYPE simde_float32; +HEDLEY_STATIC_ASSERT(sizeof(simde_float32) == 4, + "Unable to find 32-bit floating-point type."); + +#if !defined(SIMDE_FLOAT64_TYPE) +#define SIMDE_FLOAT64_TYPE double +#define SIMDE_FLOAT64_C(value) value +#else +#define SIMDE_FLOAT32_C(value) ((SIMDE_FLOAT64_TYPE)value) +#endif +typedef SIMDE_FLOAT64_TYPE simde_float64; +HEDLEY_STATIC_ASSERT(sizeof(simde_float64) == 8, + "Unable to find 64-bit floating-point type."); + +/* Whether to assume that the compiler can auto-vectorize reasonably + well. This will cause SIMDe to attempt to compose vector + operations using more simple vector operations instead of minimize + serial work. + + As an example, consider the _mm_add_ss(a, b) function from SSE, + which returns { a0 + b0, a1, a2, a3 }. This pattern is repeated + for other operations (sub, mul, etc.). + + The naïve implementation would result in loading a0 and b0, adding + them into a temporary variable, then splicing that value into a new + vector with the remaining elements from a. + + On platforms which support vectorization, it's generally faster to + simply perform the operation on the entire vector to avoid having + to move data between SIMD registers and non-SIMD registers. + Basically, instead of the temporary variable being (a0 + b0) it + would be a vector of (a + b), which is then combined with a to form + the result. + + By default, SIMDe will prefer the pure-vector versions if we detect + a vector ISA extension, but this can be overridden by defining + SIMDE_NO_ASSUME_VECTORIZATION. You can also define + SIMDE_ASSUME_VECTORIZATION if you want to force SIMDe to use the + vectorized version. */ +#if !defined(SIMDE_NO_ASSUME_VECTORIZATION) && \ + !defined(SIMDE_ASSUME_VECTORIZATION) +#if defined(__SSE__) || defined(__ARM_NEON) || defined(__mips_msa) || \ + defined(__ALTIVEC__) +#define SIMDE_ASSUME_VECTORIZATION +#endif +#endif + +/* GCC and clang have built-in functions to handle shuffling of + vectors, but the implementations are slightly different. This + macro is just an abstraction over them. Note that elem_size is in + bits but vec_size is in bytes. */ +#if HEDLEY_CLANG_HAS_BUILTIN(__builtin_shufflevector) +#define SIMDE__SHUFFLE_VECTOR(elem_size, vec_size, a, b, ...) \ + __builtin_shufflevector(a, b, __VA_ARGS__) +#elif HEDLEY_GCC_HAS_BUILTIN(__builtin_shuffle, 4, 7, 0) && \ + !defined(__INTEL_COMPILER) +#define SIMDE__SHUFFLE_VECTOR(elem_size, vec_size, a, b, ...) 
\ + __builtin_shuffle(a, b, \ + (int##elem_size##_t __attribute__( \ + (__vector_size__(vec_size)))){__VA_ARGS__}) +#endif + +/* Some algorithms are iterative, and fewer iterations means less + accuracy. Lower values here will result in faster, but less + accurate, calculations for some functions. */ +#if !defined(SIMDE_ACCURACY_ITERS) +#define SIMDE_ACCURACY_ITERS 2 +#endif + +/* This will probably move into Hedley at some point, but I'd like to + more thoroughly check for other compilers which define __GNUC__ + first. */ +#if defined(SIMDE__REALLY_GCC) +#undef SIMDE__REALLY_GCC +#endif +#if !defined(__GNUC__) || defined(__clang__) || defined(__INTEL_COMPILER) +#define SIMDE__REALLY_GCC 0 +#else +#define SIMDE__REALLY_GCC 1 +#endif + +#if defined(SIMDE__ASSUME_ALIGNED) +#undef SIMDE__ASSUME_ALIGNED +#endif +#if HEDLEY_INTEL_VERSION_CHECK(9, 0, 0) +#define SIMDE__ASSUME_ALIGNED(ptr, align) __assume_aligned(ptr, align) +#elif HEDLEY_MSVC_VERSION_CHECK(13, 10, 0) +#define SIMDE__ASSUME_ALIGNED(ptr, align) \ + __assume((((char *)ptr) - ((char *)0)) % (align) == 0) +#elif HEDLEY_GCC_HAS_BUILTIN(__builtin_assume_aligned, 4, 7, 0) +#define SIMDE__ASSUME_ALIGNED(ptr, align) \ + (ptr = (__typeof__(ptr))__builtin_assume_aligned((ptr), align)) +#elif HEDLEY_CLANG_HAS_BUILTIN(__builtin_assume) +#define SIMDE__ASSUME_ALIGNED(ptr, align) \ + __builtin_assume((((char *)ptr) - ((char *)0)) % (align) == 0) +#elif HEDLEY_GCC_HAS_BUILTIN(__builtin_unreachable, 4, 5, 0) +#define SIMDE__ASSUME_ALIGNED(ptr, align) \ + ((((char *)ptr) - ((char *)0)) % (align) == 0) \ + ? (1) \ + : (__builtin_unreachable(), 0) +#else +#define SIMDE__ASSUME_ALIGNED(ptr, align) +#endif + +/* Sometimes we run into problems with specific versions of compilers + which make the native versions unusable for us. Often this is due + to missing functions, sometimes buggy implementations, etc. These + macros are how we check for specific bugs. As they are fixed we'll + start only defining them for problematic compiler versions. */ + +#if !defined(SIMDE_IGNORE_COMPILER_BUGS) +#if SIMDE__REALLY_GCC +#if !HEDLEY_GCC_VERSION_CHECK(4, 9, 0) +#define SIMDE_BUG_GCC_REV_208793 +#endif +#if !HEDLEY_GCC_VERSION_CHECK(5, 0, 0) +#define SIMDE_BUG_GCC_BAD_MM_SRA_EPI32 /* TODO: find relevant bug or commit */ +#endif +#if !HEDLEY_GCC_VERSION_CHECK(4, 6, 0) +#define SIMDE_BUG_GCC_BAD_MM_EXTRACT_EPI8 /* TODO: find relevant bug or commit */ +#endif +#endif +#if defined(__EMSCRIPTEN__) +#define SIMDE_BUG_EMSCRIPTEN_MISSING_IMPL /* Placeholder for (as yet) unfiled issues. */ +#define SIMDE_BUG_EMSCRIPTEN_5242 +#endif +#endif + +#endif /* !defined(SIMDE_COMMON_H) */ diff --git a/libobs/util/aarch/sse.h b/libobs/util/aarch/sse.h new file mode 100644 index 000000000..6f0788124 --- /dev/null +++ b/libobs/util/aarch/sse.h @@ -0,0 +1,2591 @@ +/* Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, + * modify, merge, publish, distribute, sublicense, and/or sell copies + * of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + * Copyright: + * 2017 Evan Nemerson + * 2015-2017 John W. Ratcliff + * 2015 Brandon Rowlett + * 2015 Ken Fast + */ + +#if !defined(SIMDE__SSE_H) +#if !defined(SIMDE__SSE_H) +#define SIMDE__SSE_H +#endif +#include "mmx.h" + +#if defined(SIMDE_SSE_NATIVE) +#undef SIMDE_SSE_NATIVE +#endif +#if defined(SIMDE_SSE_FORCE_NATIVE) +#define SIMDE_SSE_NATIVE +#elif defined(__SSE__) && !defined(SIMDE_SSE_NO_NATIVE) && \ + !defined(SIMDE_NO_NATIVE) +#define SIMDE_SSE_NATIVE +#elif defined(__ARM_NEON) && !defined(SIMDE_SSE_NO_NEON) && \ + !defined(SIMDE_NO_NEON) +#define SIMDE_SSE_NEON +#endif + +#if defined(SIMDE_SSE_NATIVE) && !defined(SIMDE_MMX_NATIVE) +#if defined(SIMDE_SSE_FORCE_NATIVE) +#error Native SSE support requires native MMX support +#else +#warning Native SSE support requires native MMX support, disabling +#undef SIMDE_SSE_NATIVE +#endif +#elif defined(SIMDE_SSE_NEON) && !defined(SIMDE_MMX_NEON) +#warning SSE3 NEON support requires MMX NEON support, disabling +#undef SIMDE_SSE3_NEON +#endif + +#if defined(SIMDE_SSE_NATIVE) +#include +#else +#if defined(SIMDE_SSE_NEON) +#include +#endif + +#if !defined(__INTEL_COMPILER) && defined(__STDC_VERSION__) && \ + (__STDC_VERSION__ >= 201112L) && !defined(__STDC_NO_ATOMICS__) +#include +#elif defined(_WIN32) +#include +#endif +#endif + +#include +#include + +#define SIMDE_ALIGN(alignment) __attribute__((aligned(alignment))) +SIMDE__BEGIN_DECLS + +typedef SIMDE_ALIGN(16) union { +#if defined(SIMDE__ENABLE_GCC_VEC_EXT) + int8_t i8 __attribute__((__vector_size__(16), __may_alias__)); + int16_t i16 __attribute__((__vector_size__(16), __may_alias__)); + int32_t i32 __attribute__((__vector_size__(16), __may_alias__)); + int64_t i64 __attribute__((__vector_size__(16), __may_alias__)); + uint8_t u8 __attribute__((__vector_size__(16), __may_alias__)); + uint16_t u16 __attribute__((__vector_size__(16), __may_alias__)); + uint32_t u32 __attribute__((__vector_size__(16), __may_alias__)); + uint64_t u64 __attribute__((__vector_size__(16), __may_alias__)); +#if defined(SIMDE__HAVE_INT128) + simde_int128 i128 __attribute__((__vector_size__(16), __may_alias__)); + simde_uint128 u128 __attribute__((__vector_size__(16), __may_alias__)); +#endif + simde_float32 f32 __attribute__((__vector_size__(16), __may_alias__)); +#else + int8_t i8[16]; + int16_t i16[8]; + int32_t i32[4]; + int64_t i64[2]; + uint8_t u8[16]; + uint16_t u16[8]; + uint32_t u32[4]; + uint64_t u64[2]; +#if defined(SIMDE__HAVE_INT128) + simde_int128 i128[1]; + simde_uint128 u128[1]; +#endif + simde_float32 f32[4]; +#endif + +#if defined(SIMDE_SSE_NATIVE) + __m128 n; +#elif defined(SIMDE_SSE_NEON) + int8x16_t neon_i8; + int16x8_t neon_i16; + int32x4_t neon_i32; + int64x2_t neon_i64; + uint8x16_t neon_u8; + uint16x8_t neon_u16; + uint32x4_t neon_u32; + uint64x2_t neon_u64; + float32x4_t neon_f32; +#endif +} simde__m128; + +#if defined(SIMDE_SSE_NATIVE) +HEDLEY_STATIC_ASSERT(sizeof(__m128) == sizeof(simde__m128), + "__m128 size doesn't match simde__m128 size"); +SIMDE__FUNCTION_ATTRIBUTES simde__m128 
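+/* Not part of upstream SIMDe, just an illustrative note: simde__m128 is a
+ * plain union, so the portable fall-back paths below access lanes with
+ * ordinary array indexing.  A hypothetical helper that sums the four lanes
+ * could be written directly against the union, e.g.
+ *
+ *   static simde_float32 sum_m128_lanes(simde__m128 v)
+ *   {
+ *           return v.f32[0] + v.f32[1] + v.f32[2] + v.f32[3];
+ *   }
+ *
+ * (sum_m128_lanes is only a sketch and is not defined anywhere in this
+ * patch.) */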
SIMDE__M128_C(__m128 v) +{ + simde__m128 r; + r.n = v; + return r; +} +#elif defined(SIMDE_SSE_NEON) +#define SIMDE__M128_NEON_C(T, expr) \ + (simde__m128) { .neon_##T = expr } +#endif +HEDLEY_STATIC_ASSERT(16 == sizeof(simde__m128), "simde__m128 size incorrect"); + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128 simde_mm_add_ps(simde__m128 a, simde__m128 b) +{ + simde__m128 r; + +#if defined(SIMDE_SSE_NATIVE) + r.n = _mm_add_ps(a.n, b.n); +#elif defined(SIMDE_SSE_NEON) + r.neon_f32 = vaddq_f32(a.neon_f32, b.neon_f32); +#else + SIMDE__VECTORIZE + for (size_t i = 0; i < (sizeof(r.f32) / sizeof(r.f32[0])); i++) { + r.f32[i] = a.f32[i] + b.f32[i]; + } +#endif + + return r; +} + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128 simde_mm_add_ss(simde__m128 a, simde__m128 b) +{ + simde__m128 r; + +#if defined(SIMDE_SSE_NATIVE) + r.n = _mm_add_ss(a.n, b.n); +#elif defined(SIMDE_SSE_NEON) + float32_t b0 = vgetq_lane_f32(b.neon_f32, 0); + float32x4_t value = vsetq_lane_f32(b0, vdupq_n_f32(0), 0); + /* the upper values in the result must be the remnants of . */ + r.neon_f32 = vaddq_f32(a.neon_f32, value); +#elif defined(SIMDE__SHUFFLE_VECTOR) && defined(SIMDE_ASSUME_VECTORIZATION) + r.f32 = SIMDE__SHUFFLE_VECTOR(32, 16, a.f32, simde_mm_add_ps(a, b).f32, + 4, 1, 2, 3); +#else + r.f32[0] = a.f32[0] + b.f32[0]; + r.f32[1] = a.f32[1]; + r.f32[2] = a.f32[2]; + r.f32[3] = a.f32[3]; +#endif + + return r; +} + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128 simde_mm_and_ps(simde__m128 a, simde__m128 b) +{ + simde__m128 r; + +#if defined(SIMDE_SSE_NATIVE) + r.n = _mm_and_ps(a.n, b.n); +#elif defined(SIMDE_SSE_NEON) + r.neon_i32 = vandq_s32(a.neon_i32, b.neon_i32); +#else + SIMDE__VECTORIZE + for (size_t i = 0; i < (sizeof(r.i32) / sizeof(r.i32[0])); i++) { + r.i32[i] = a.i32[i] & b.i32[i]; + } +#endif + + return r; +} + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128 simde_mm_andnot_ps(simde__m128 a, simde__m128 b) +{ + simde__m128 r; + +#if defined(SIMDE_SSE_NATIVE) + r.n = _mm_andnot_ps(a.n, b.n); +#elif defined(SIMDE_SSE_NEON) + r.neon_i32 = vbicq_s32(b.neon_i32, a.neon_i32); +#else + SIMDE__VECTORIZE + for (size_t i = 0; i < (sizeof(r.i32) / sizeof(r.i32[0])); i++) { + r.i32[i] = ~(a.i32[i]) & b.i32[i]; + } +#endif + + return r; +} + +SIMDE__FUNCTION_ATTRIBUTES +simde__m64 simde_mm_avg_pu16(simde__m64 a, simde__m64 b) +{ + simde__m64 r; + +#if defined(SIMDE_SSE_NATIVE) + r.n = _mm_avg_pu16(a.n, b.n); +#elif defined(SIMDE_SSE_NEON) + r.neon_u16 = vrhadd_u16(b.neon_u16, a.neon_u16); +#else + SIMDE__VECTORIZE + for (size_t i = 0; i < 4; i++) { + r.u16[i] = (a.u16[i] + b.u16[i] + 1) >> 1; + } +#endif + + return r; +} +#define simde_m_pavgw(a, b) simde_mm_avg_pu16(a, b) + +SIMDE__FUNCTION_ATTRIBUTES +simde__m64 simde_mm_avg_pu8(simde__m64 a, simde__m64 b) +{ + simde__m64 r; + +#if defined(SIMDE_SSE_NATIVE) + r.n = _mm_avg_pu8(a.n, b.n); +#elif defined(SIMDE_SSE_NEON) + r.neon_u8 = vrhadd_u8(b.neon_u8, a.neon_u8); +#else + SIMDE__VECTORIZE + for (size_t i = 0; i < 8; i++) { + r.u8[i] = (a.u8[i] + b.u8[i] + 1) >> 1; + } +#endif + + return r; +} +#define simde_m_pavgb(a, b) simde_mm_avg_pu8(a, b) + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128 simde_mm_cmpeq_ps(simde__m128 a, simde__m128 b) +{ + simde__m128 r; + +#if defined(SIMDE_SSE_NATIVE) + r.n = _mm_cmpeq_ps(a.n, b.n); +#elif defined(SIMDE_SSE_NEON) + r.neon_u32 = vceqq_f32(a.neon_f32, b.neon_f32); +#else + SIMDE__VECTORIZE + for (size_t i = 0; i < (sizeof(r.f32) / sizeof(r.f32[0])); i++) { + r.u32[i] = (a.f32[i] == b.f32[i]) ? 
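+			/* Comparison intrinsics do not return a bool: each
+			 * lane becomes an all-ones mask (0xffffffff) when
+			 * true and 0 when false, which callers typically use
+			 * for branch-free selects, e.g. (illustrative only,
+			 * m/x/y are placeholder variables):
+			 *
+			 *   m = simde_mm_cmpeq_ps(a, b);
+			 *   r = simde_mm_or_ps(simde_mm_and_ps(m, x),
+			 *                      simde_mm_andnot_ps(m, y));
+			 */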
0xffffffff : 0; + } +#endif + + return r; +} + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128 simde_mm_cmpeq_ss(simde__m128 a, simde__m128 b) +{ + simde__m128 r; + +#if defined(SIMDE_SSE_NATIVE) + r.n = _mm_cmpeq_ss(a.n, b.n); +#elif defined(SIMDE_SSE_NEON) + float32x4_t s = + vreinterpretq_f32_u32(vceqq_f32(a.neon_f32, b.neon_f32)); + float32x4_t t = vextq_f32(a.neon_f32, s, 1); + r.neon_f32 = vextq_f32(t, t, 3); +#elif defined(SIMDE__SHUFFLE_VECTOR) && defined(SIMDE_ASSUME_VECTORIZATION) + r.f32 = SIMDE__SHUFFLE_VECTOR(32, 16, a.f32, + simde_mm_cmpeq_ps(a, b).f32, 4, 1, 2, 3); +#else + r.u32[0] = (a.f32[0] == b.f32[0]) ? 0xffffffff : 0; + SIMDE__VECTORIZE + for (size_t i = 1; i < (sizeof(r.f32) / sizeof(r.f32[0])); i++) { + r.u32[i] = a.u32[i]; + } +#endif + + return r; +} + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128 simde_mm_cmpge_ps(simde__m128 a, simde__m128 b) +{ + simde__m128 r; + +#if defined(SIMDE_SSE_NATIVE) + r.n = _mm_cmpge_ps(a.n, b.n); +#elif defined(SIMDE_SSE_NEON) + r.neon_u32 = vcgeq_f32(a.neon_f32, b.neon_f32); +#else + SIMDE__VECTORIZE + for (size_t i = 0; i < (sizeof(r.f32) / sizeof(r.f32[0])); i++) { + r.u32[i] = (a.f32[i] >= b.f32[i]) ? 0xffffffff : 0; + } +#endif + + return r; +} + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128 simde_mm_cmpge_ss(simde__m128 a, simde__m128 b) +{ + simde__m128 r; + +#if defined(SIMDE_SSE_NATIVE) && !defined(__PGI) + r.n = _mm_cmpge_ss(a.n, b.n); +#elif defined(SIMDE_SSE_NEON) + float32x4_t s = + vreinterpretq_f32_u32(vcgeq_f32(a.neon_f32, b.neon_f32)); + float32x4_t t = vextq_f32(a.neon_f32, s, 1); + r.neon_f32 = vextq_f32(t, t, 3); +#elif defined(SIMDE__SHUFFLE_VECTOR) && defined(SIMDE_ASSUME_VECTORIZATION) + r.f32 = SIMDE__SHUFFLE_VECTOR(32, 16, a.f32, + simde_mm_cmpge_ps(a, b).f32, 4, 1, 2, 3); +#else + r.u32[0] = (a.f32[0] >= b.f32[0]) ? 0xffffffff : 0; + SIMDE__VECTORIZE + for (size_t i = 1; i < (sizeof(r.f32) / sizeof(r.f32[0])); i++) { + r.u32[i] = a.u32[i]; + } +#endif + + return r; +} + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128 simde_mm_cmpgt_ps(simde__m128 a, simde__m128 b) +{ + simde__m128 r; + +#if defined(SIMDE_SSE_NATIVE) + r.n = _mm_cmpgt_ps(a.n, b.n); +#elif defined(SIMDE_SSE_NEON) + r.neon_u32 = vcgtq_f32(a.neon_f32, b.neon_f32); +#else + SIMDE__VECTORIZE + for (size_t i = 0; i < (sizeof(r.f32) / sizeof(r.f32[0])); i++) { + r.u32[i] = (a.f32[i] > b.f32[i]) ? 0xffffffff : 0; + } +#endif + + return r; +} + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128 simde_mm_cmpgt_ss(simde__m128 a, simde__m128 b) +{ + simde__m128 r; + +#if defined(SIMDE_SSE_NATIVE) && !defined(__PGI) + r.n = _mm_cmpgt_ss(a.n, b.n); +#elif defined(SIMDE_SSE_NEON) + float32x4_t s = + vreinterpretq_f32_u32(vcgtq_f32(a.neon_f32, b.neon_f32)); + float32x4_t t = vextq_f32(a.neon_f32, s, 1); + r.neon_f32 = vextq_f32(t, t, 3); +#elif defined(SIMDE__SHUFFLE_VECTOR) && defined(SIMDE_ASSUME_VECTORIZATION) + r.f32 = SIMDE__SHUFFLE_VECTOR(32, 16, a.f32, + simde_mm_cmpgt_ps(a, b).f32, 4, 1, 2, 3); +#else + r.u32[0] = (a.f32[0] > b.f32[0]) ? 0xffffffff : 0; + SIMDE__VECTORIZE + for (size_t i = 1; i < (sizeof(r.f32) / sizeof(r.f32[0])); i++) { + r.u32[i] = a.u32[i]; + } +#endif + + return r; +} + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128 simde_mm_cmple_ps(simde__m128 a, simde__m128 b) +{ + simde__m128 r; + +#if defined(SIMDE_SSE_NATIVE) + r.n = _mm_cmple_ps(a.n, b.n); +#elif defined(SIMDE_SSE_NEON) + r.neon_u32 = vcleq_f32(a.neon_f32, b.neon_f32); +#else + SIMDE__VECTORIZE + for (size_t i = 0; i < (sizeof(r.f32) / sizeof(r.f32[0])); i++) { + r.u32[i] = (a.f32[i] <= b.f32[i]) ? 
0xffffffff : 0; + } +#endif + + return r; +} + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128 simde_mm_cmple_ss(simde__m128 a, simde__m128 b) +{ + simde__m128 r; + +#if defined(SIMDE_SSE_NATIVE) + r.n = _mm_cmple_ss(a.n, b.n); +#elif defined(SIMDE_SSE_NEON) + float32x4_t s = + vreinterpretq_f32_u32(vcleq_f32(a.neon_f32, b.neon_f32)); + float32x4_t t = vextq_f32(a.neon_f32, s, 1); + r.neon_f32 = vextq_f32(t, t, 3); +#elif defined(SIMDE__SHUFFLE_VECTOR) && defined(SIMDE_ASSUME_VECTORIZATION) + r.f32 = SIMDE__SHUFFLE_VECTOR(32, 16, a.f32, + simde_mm_cmple_ps(a, b).f32, 4, 1, 2, 3); +#else + r.u32[0] = (a.f32[0] <= b.f32[0]) ? 0xffffffff : 0; + SIMDE__VECTORIZE + for (size_t i = 1; i < (sizeof(r.f32) / sizeof(r.f32[0])); i++) { + r.u32[i] = a.u32[i]; + } +#endif + + return r; +} + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128 simde_mm_cmplt_ps(simde__m128 a, simde__m128 b) +{ + simde__m128 r; + +#if defined(SIMDE_SSE_NATIVE) + r.n = _mm_cmplt_ps(a.n, b.n); +#elif defined(SIMDE_SSE_NEON) + r.neon_u32 = vcltq_f32(a.neon_f32, b.neon_f32); +#else + SIMDE__VECTORIZE + for (size_t i = 0; i < (sizeof(r.f32) / sizeof(r.f32[0])); i++) { + r.u32[i] = (a.f32[i] < b.f32[i]) ? 0xffffffff : 0; + } +#endif + + return r; +} + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128 simde_mm_cmplt_ss(simde__m128 a, simde__m128 b) +{ + simde__m128 r; + +#if defined(SIMDE_SSE_NATIVE) + r.n = _mm_cmplt_ss(a.n, b.n); +#elif defined(SIMDE_SSE_NEON) + float32x4_t s = + vreinterpretq_f32_u32(vcltq_f32(a.neon_f32, b.neon_f32)); + float32x4_t t = vextq_f32(a.neon_f32, s, 1); + r.neon_f32 = vextq_f32(t, t, 3); +#elif defined(SIMDE__SHUFFLE_VECTOR) && defined(SIMDE_ASSUME_VECTORIZATION) + r.f32 = SIMDE__SHUFFLE_VECTOR(32, 16, a.f32, + simde_mm_cmplt_ps(a, b).f32, 4, 1, 2, 3); +#else + r.u32[0] = (a.f32[0] < b.f32[0]) ? 0xffffffff : 0; + SIMDE__VECTORIZE + for (size_t i = 1; i < (sizeof(r.f32) / sizeof(r.f32[0])); i++) { + r.u32[i] = a.u32[i]; + } +#endif + + return r; +} + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128 simde_mm_cmpneq_ps(simde__m128 a, simde__m128 b) +{ + simde__m128 r; + +#if defined(SIMDE_SSE_NATIVE) + r.n = _mm_cmpneq_ps(a.n, b.n); +#elif defined(SIMDE_SSE_NEON) + r.neon_u32 = vmvnq_u32(vceqq_f32(a.neon_f32, b.neon_f32)); +#else + SIMDE__VECTORIZE + for (size_t i = 0; i < (sizeof(r.f32) / sizeof(r.f32[0])); i++) { + r.u32[i] = (a.f32[i] != b.f32[i]) ? 0xffffffff : 0; + } +#endif + + return r; +} + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128 simde_mm_cmpneq_ss(simde__m128 a, simde__m128 b) +{ + simde__m128 r; + +#if defined(SIMDE_SSE_NATIVE) + r.n = _mm_cmpneq_ss(a.n, b.n); +#elif defined(SIMDE_SSE_NEON) + float32x4_t e = + vreinterpretq_f32_u32(vceqq_f32(a.neon_f32, b.neon_f32)); + float32x4_t s = + vreinterpretq_f32_u32(vmvnq_u32(vreinterpretq_u32_f32(e))); + float32x4_t t = vextq_f32(a.neon_f32, s, 1); + r.neon_f32 = vextq_f32(t, t, 3); +#elif defined(SIMDE__SHUFFLE_VECTOR) && defined(SIMDE_ASSUME_VECTORIZATION) + r.f32 = SIMDE__SHUFFLE_VECTOR(32, 16, a.f32, + simde_mm_cmpneq_ps(a, b).f32, 4, 1, 2, 3); +#else + r.u32[0] = (a.f32[0] != b.f32[0]) ? 
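+	/* The *_ss comparison variants operate on lane 0 only; lanes 1-3 are
+	 * copied through from `a` unchanged, which is exactly what the loop
+	 * below does. */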
0xffffffff : 0; + SIMDE__VECTORIZE + for (size_t i = 1; i < (sizeof(r.f32) / sizeof(r.f32[0])); i++) { + r.u32[i] = a.u32[i]; + } +#endif + + return r; +} + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128 simde_mm_cmpnge_ps(simde__m128 a, simde__m128 b) +{ + simde__m128 r; + +#if defined(SIMDE_SSE_NATIVE) + r.n = _mm_cmpnge_ps(a.n, b.n); +#elif defined(SIMDE_SSE_NEON) + r.neon_u32 = vcltq_f32(a.neon_f32, b.neon_f32); +#else + r = simde_mm_cmplt_ps(a, b); +#endif + + return r; +} + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128 simde_mm_cmpnge_ss(simde__m128 a, simde__m128 b) +{ + simde__m128 r; + +#if defined(SIMDE_SSE_NATIVE) && !defined(__PGI) + r.n = _mm_cmpnge_ss(a.n, b.n); +#elif defined(SIMDE_SSE_NEON) + float32x4_t s = + vreinterpretq_f32_u32(vcltq_f32(a.neon_f32, b.neon_f32)); + float32x4_t t = vextq_f32(a.neon_f32, s, 1); + r.neon_f32 = vextq_f32(t, t, 3); +#else + r = simde_mm_cmplt_ss(a, b); +#endif + + return r; +} + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128 simde_mm_cmpngt_ps(simde__m128 a, simde__m128 b) +{ + simde__m128 r; + +#if defined(SIMDE_SSE_NATIVE) + r.n = _mm_cmpngt_ps(a.n, b.n); +#elif defined(SIMDE_SSE_NEON) + r.neon_u32 = vcleq_f32(a.neon_f32, b.neon_f32); +#else + r = simde_mm_cmple_ps(a, b); +#endif + + return r; +} + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128 simde_mm_cmpngt_ss(simde__m128 a, simde__m128 b) +{ + simde__m128 r; + +#if defined(SIMDE_SSE_NATIVE) && !defined(__PGI) + r.n = _mm_cmpngt_ss(a.n, b.n); +#elif defined(SIMDE_SSE_NEON) + float32x4_t s = + vreinterpretq_f32_u32(vcleq_f32(a.neon_f32, b.neon_f32)); + float32x4_t t = vextq_f32(a.neon_f32, s, 1); + r.neon_f32 = vextq_f32(t, t, 3); +#else + r = simde_mm_cmple_ss(a, b); +#endif + + return r; +} + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128 simde_mm_cmpnle_ps(simde__m128 a, simde__m128 b) +{ + simde__m128 r; + +#if defined(SIMDE_SSE_NATIVE) + r.n = _mm_cmpnle_ps(a.n, b.n); +#elif defined(SIMDE_SSE_NEON) + r.neon_u32 = vcgtq_f32(a.neon_f32, b.neon_f32); +#else + r = simde_mm_cmpgt_ps(a, b); +#endif + + return r; +} + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128 simde_mm_cmpnle_ss(simde__m128 a, simde__m128 b) +{ + simde__m128 r; + +#if defined(SIMDE_SSE_NATIVE) + r.n = _mm_cmpnle_ss(a.n, b.n); +#elif defined(SIMDE_SSE_NEON) + float32x4_t s = + vreinterpretq_f32_u32(vcgtq_f32(a.neon_f32, b.neon_f32)); + float32x4_t t = vextq_f32(a.neon_f32, s, 1); + r.neon_f32 = vextq_f32(t, t, 3); +#else + r = simde_mm_cmpgt_ss(a, b); +#endif + + return r; +} + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128 simde_mm_cmpnlt_ps(simde__m128 a, simde__m128 b) +{ + simde__m128 r; + +#if defined(SIMDE_SSE_NATIVE) + r.n = _mm_cmpnlt_ps(a.n, b.n); +#elif defined(SIMDE_SSE_NEON) + r.neon_u32 = vcgeq_f32(a.neon_f32, b.neon_f32); +#else + r = simde_mm_cmpge_ps(a, b); +#endif + + return r; +} + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128 simde_mm_cmpnlt_ss(simde__m128 a, simde__m128 b) +{ + simde__m128 r; + +#if defined(SIMDE_SSE_NATIVE) + r.n = _mm_cmpnlt_ss(a.n, b.n); +#else + r = simde_mm_cmpge_ss(a, b); +#endif + + return r; +} + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128 simde_mm_cmpord_ps(simde__m128 a, simde__m128 b) +{ + simde__m128 r; + +#if defined(SIMDE_SSE_NATIVE) + r.n = _mm_cmpord_ps(a.n, b.n); +#elif defined(SIMDE_SSE_NEON) + /* Note: NEON does not have ordered compare builtin + Need to compare a eq a and b eq b to check for NaN + Do AND of results to get final */ + uint32x4_t ceqaa = vceqq_f32(a.neon_f32, a.neon_f32); + uint32x4_t ceqbb = vceqq_f32(b.neon_f32, b.neon_f32); + r.neon_u32 = vandq_u32(ceqaa, ceqbb); +#else + SIMDE__VECTORIZE + for 
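+	/* "Ordered" means neither operand is NaN: lanes where both values
+	 * are ordinary numbers become 0xffffffff, lanes containing a NaN
+	 * become 0.  simde_mm_cmpunord_ps() below is the exact complement. */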
(size_t i = 0; i < (sizeof(r.f32) / sizeof(r.f32[0])); i++) { + r.u32[i] = (isnan(a.f32[i]) || isnan(b.f32[i])) ? 0 + : 0xffffffff; + } +#endif + + return r; +} + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128 simde_mm_cmpord_ss(simde__m128 a, simde__m128 b) +{ + simde__m128 r; + +#if defined(SIMDE_SSE_NATIVE) + r.n = _mm_cmpord_ss(a.n, b.n); +#elif defined(SIMDE_SSE_NEON) + uint32x4_t ceqaa = vceqq_f32(a.neon_f32, a.neon_f32); + uint32x4_t ceqbb = vceqq_f32(b.neon_f32, b.neon_f32); + float32x4_t s = vreinterpretq_f32_u32(vandq_u32(ceqaa, ceqbb)); + float32x4_t t = vextq_f32(a.neon_f32, s, 1); + r.neon_f32 = vextq_f32(t, t, 3); +#elif defined(SIMDE__SHUFFLE_VECTOR) && defined(SIMDE_ASSUME_VECTORIZATION) + r.f32 = SIMDE__SHUFFLE_VECTOR(32, 16, a.f32, + simde_mm_cmpord_ps(a, b).f32, 4, 1, 2, 3); +#else + r.u32[0] = (isnan(a.f32[0]) || isnan(b.f32[0])) ? 0 : 0xffffffff; + SIMDE__VECTORIZE + for (size_t i = 1; i < (sizeof(r.f32) / sizeof(r.f32[0])); i++) { + r.f32[i] = a.f32[i]; + } +#endif + + return r; +} + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128 simde_mm_cmpunord_ps(simde__m128 a, simde__m128 b) +{ + simde__m128 r; + +#if defined(SIMDE_SSE_NATIVE) + r.n = _mm_cmpunord_ps(a.n, b.n); +#else + SIMDE__VECTORIZE + for (size_t i = 0; i < (sizeof(r.f32) / sizeof(r.f32[0])); i++) { + r.u32[i] = (isnan(a.f32[i]) || isnan(b.f32[i])) ? 0xffffffff + : 0; + } +#endif + + return r; +} + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128 simde_mm_cmpunord_ss(simde__m128 a, simde__m128 b) +{ + simde__m128 r; + +#if defined(SIMDE_SSE_NATIVE) && !defined(__PGI) + r.n = _mm_cmpunord_ss(a.n, b.n); +#elif defined(SIMDE__SHUFFLE_VECTOR) && defined(SIMDE_ASSUME_VECTORIZATION) + r.f32 = SIMDE__SHUFFLE_VECTOR( + 32, 16, a.f32, simde_mm_cmpunord_ps(a, b).f32, 4, 1, 2, 3); +#else + r.u32[0] = (isnan(a.f32[0]) || isnan(b.f32[0])) ? 0xffffffff : 0; + SIMDE__VECTORIZE + for (size_t i = 1; i < (sizeof(r.f32) / sizeof(r.f32[0])); i++) { + r.f32[i] = a.f32[i]; + } +#endif + + return r; +} + +SIMDE__FUNCTION_ATTRIBUTES +int simde_mm_comieq_ss(simde__m128 a, simde__m128 b) +{ +#if defined(SIMDE_SSE_NATIVE) + return _mm_comieq_ss(a.n, b.n); +#elif defined(SIMDE_SSE_NEON) + uint32x4_t a_not_nan = vceqq_f32(a.neon_f32, a.neon_f32); + uint32x4_t b_not_nan = vceqq_f32(b.neon_f32, b.neon_f32); + uint32x4_t a_or_b_nan = vmvnq_u32(vandq_u32(a_not_nan, b_not_nan)); + uint32x4_t a_eq_b = vceqq_f32(a.neon_f32, b.neon_f32); + return (vgetq_lane_u32(vorrq_u32(a_or_b_nan, a_eq_b), 0) != 0) ? 1 : 0; +#else + return a.f32[0] == b.f32[0]; +#endif +} + +SIMDE__FUNCTION_ATTRIBUTES +int simde_mm_comige_ss(simde__m128 a, simde__m128 b) +{ +#if defined(SIMDE_SSE_NATIVE) + return _mm_comige_ss(a.n, b.n); +#elif defined(SIMDE_SSE_NEON) + uint32x4_t a_not_nan = vceqq_f32(a.neon_f32, a.neon_f32); + uint32x4_t b_not_nan = vceqq_f32(b.neon_f32, b.neon_f32); + uint32x4_t a_and_b_not_nan = vandq_u32(a_not_nan, b_not_nan); + uint32x4_t a_ge_b = vcgeq_f32(a.neon_f32, b.neon_f32); + return (vgetq_lane_u32(vandq_u32(a_and_b_not_nan, a_ge_b), 0) != 0) ? 
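+	       /* When either input is NaN, the AND with a_and_b_not_nan
+		* above forces a 0 result, matching what hardware COMISS
+		* reports for a "greater or equal" test on unordered
+		* operands. */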
1 + : 0; +#else + return a.f32[0] >= b.f32[0]; +#endif +} + +SIMDE__FUNCTION_ATTRIBUTES +int simde_mm_comigt_ss(simde__m128 a, simde__m128 b) +{ +#if defined(SIMDE_SSE_NATIVE) + return _mm_comigt_ss(a.n, b.n); +#elif defined(SIMDE_SSE_NEON) + uint32x4_t a_not_nan = vceqq_f32(a.neon_f32, a.neon_f32); + uint32x4_t b_not_nan = vceqq_f32(b.neon_f32, b.neon_f32); + uint32x4_t a_and_b_not_nan = vandq_u32(a_not_nan, b_not_nan); + uint32x4_t a_gt_b = vcgtq_f32(a.neon_f32, b.neon_f32); + return (vgetq_lane_u32(vandq_u32(a_and_b_not_nan, a_gt_b), 0) != 0) ? 1 + : 0; +#else + return a.f32[0] > b.f32[0]; +#endif +} + +SIMDE__FUNCTION_ATTRIBUTES +int simde_mm_comile_ss(simde__m128 a, simde__m128 b) +{ +#if defined(SIMDE_SSE_NATIVE) + return _mm_comile_ss(a.n, b.n); +#elif defined(SIMDE_SSE_NEON) + uint32x4_t a_not_nan = vceqq_f32(a.neon_f32, a.neon_f32); + uint32x4_t b_not_nan = vceqq_f32(b.neon_f32, b.neon_f32); + uint32x4_t a_or_b_nan = vmvnq_u32(vandq_u32(a_not_nan, b_not_nan)); + uint32x4_t a_le_b = vcleq_f32(a.neon_f32, b.neon_f32); + return (vgetq_lane_u32(vorrq_u32(a_or_b_nan, a_le_b), 0) != 0) ? 1 : 0; +#else + return a.f32[0] <= b.f32[0]; +#endif +} + +SIMDE__FUNCTION_ATTRIBUTES +int simde_mm_comilt_ss(simde__m128 a, simde__m128 b) +{ +#if defined(SIMDE_SSE_NATIVE) + return _mm_comilt_ss(a.n, b.n); +#elif defined(SIMDE_SSE_NATIVE) + uint32x4_t a_not_nan = vceqq_f32(a.neon_f32, a.neon_f32); + uint32x4_t b_not_nan = vceqq_f32(b.neon_f32, b.neon_f32); + uint32x4_t a_or_b_nan = vmvnq_u32(vandq_u32(a_not_nan, b_not_nan)); + uint32x4_t a_lt_b = vcltq_f32(a.neon_f32, b.neon_f32); + return (vgetq_lane_u32(vorrq_u32(a_or_b_nan, a_lt_b), 0) != 0) ? 1 : 0; +#else + return a.f32[0] < b.f32[0]; +#endif +} + +SIMDE__FUNCTION_ATTRIBUTES +int simde_mm_comineq_ss(simde__m128 a, simde__m128 b) +{ +#if defined(SIMDE_SSE_NATIVE) + return _mm_comineq_ss(a.n, b.n); +#elif defined(SIMDE_SSE_NEON) + uint32x4_t a_not_nan = vceqq_f32(a.neon_f32, a.neon_f32); + uint32x4_t b_not_nan = vceqq_f32(b.neon_f32, b.neon_f32); + uint32x4_t a_and_b_not_nan = vandq_u32(a_not_nan, b_not_nan); + uint32x4_t a_neq_b = vmvnq_u32(vceqq_f32(a.neon_f32, b.neon_f32)); + return (vgetq_lane_u32(vandq_u32(a_and_b_not_nan, a_neq_b), 0) != 0) + ? 
1 + : 0; +#else + return a.f32[0] != b.f32[0]; +#endif +} + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128 simde_mm_cvt_pi2ps(simde__m128 a, simde__m64 b) +{ + simde__m128 r; + +#if defined(SIMDE_SSE_NATIVE) + r.n = _mm_cvt_pi2ps(a.n, b.n); +#else + r.f32[0] = (simde_float32)b.i32[0]; + r.f32[1] = (simde_float32)b.i32[1]; + r.i32[2] = a.i32[2]; + r.i32[3] = a.i32[3]; +#endif + + return r; +} + +SIMDE__FUNCTION_ATTRIBUTES +simde__m64 simde_mm_cvt_ps2pi(simde__m128 a) +{ + simde__m64 r; + +#if defined(SIMDE_SSE_NATIVE) + r.n = _mm_cvt_ps2pi(a.n); +#else + SIMDE__VECTORIZE + for (size_t i = 0; i < (sizeof(r.i32) / sizeof(r.i32[0])); i++) { + r.i32[i] = (int32_t)a.f32[i]; + } +#endif + + return r; +} + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128 simde_mm_cvt_si2ss(simde__m128 a, int32_t b) +{ + simde__m128 r; + +#if defined(SIMDE_SSE_NATIVE) + r.n = _mm_cvt_si2ss(a.n, b); +#else + r.f32[0] = (simde_float32)b; + r.i32[1] = a.i32[1]; + r.i32[2] = a.i32[2]; + r.i32[3] = a.i32[3]; +#endif + + return r; +} + +SIMDE__FUNCTION_ATTRIBUTES +int32_t simde_mm_cvt_ss2si(simde__m128 a) +{ +#if defined(SIMDE_SSE_NATIVE) + return _mm_cvt_ss2si(a.n); +#else + return (int32_t)a.f32[0]; +#endif +} + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128 simde_mm_cvtpi16_ps(simde__m64 a) +{ + simde__m128 r; + +#if defined(SIMDE_SSE_NATIVE) + r.n = _mm_cvtpi16_ps(a.n); +#else + SIMDE__VECTORIZE + for (size_t i = 0; i < (sizeof(r.f32) / sizeof(r.f32[0])); i++) { + r.f32[i] = (simde_float32)a.i16[i]; + } +#endif + + return r; +} + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128 simde_mm_cvtpi32_ps(simde__m128 a, simde__m64 b) +{ + simde__m128 r; + +#if defined(SIMDE_SSE_NATIVE) + r.n = _mm_cvtpi32_ps(a.n, b.n); +#else + r.f32[0] = (simde_float32)b.i32[0]; + r.f32[1] = (simde_float32)b.i32[1]; + r.i32[2] = a.i32[2]; + r.i32[3] = a.i32[3]; +#endif + + return r; +} + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128 simde_mm_cvtpi32x2_ps(simde__m64 a, simde__m64 b) +{ + simde__m128 r; + +#if defined(SIMDE_SSE_NATIVE) + r.n = _mm_cvtpi32x2_ps(a.n, b.n); +#else + r.f32[0] = (simde_float32)a.i32[0]; + r.f32[1] = (simde_float32)a.i32[1]; + r.f32[2] = (simde_float32)b.i32[0]; + r.f32[3] = (simde_float32)b.i32[1]; +#endif + + return r; +} + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128 simde_mm_cvtpi8_ps(simde__m64 a) +{ + simde__m128 r; + +#if defined(SIMDE_SSE_NATIVE) + r.n = _mm_cvtpi8_ps(a.n); +#else + r.f32[0] = (simde_float32)a.i8[0]; + r.f32[1] = (simde_float32)a.i8[1]; + r.f32[2] = (simde_float32)a.i8[2]; + r.f32[3] = (simde_float32)a.i8[3]; +#endif + + return r; +} + +SIMDE__FUNCTION_ATTRIBUTES +simde__m64 simde_mm_cvtps_pi16(simde__m128 a) +{ + simde__m64 r; + +#if defined(SIMDE_SSE_NATIVE) + r.n = _mm_cvtps_pi16(a.n); +#else + SIMDE__VECTORIZE + for (size_t i = 0; i < (sizeof(r.i16) / sizeof(r.i16[0])); i++) { + r.i16[i] = (int16_t)a.f32[i]; + } +#endif + + return r; +} + +SIMDE__FUNCTION_ATTRIBUTES +simde__m64 simde_mm_cvtps_pi32(simde__m128 a) +{ + simde__m64 r; + +#if defined(SIMDE_SSE_NATIVE) + r.n = _mm_cvtps_pi32(a.n); +#else + SIMDE__VECTORIZE + for (size_t i = 0; i < (sizeof(r.i32) / sizeof(r.i32[0])); i++) { + r.i32[i] = (int32_t)a.f32[i]; + } +#endif + + return r; +} + +SIMDE__FUNCTION_ATTRIBUTES +simde__m64 simde_mm_cvtps_pi8(simde__m128 a) +{ + simde__m64 r; + +#if defined(SIMDE_SSE_NATIVE) + r.n = _mm_cvtps_pi8(a.n); +#else + SIMDE__VECTORIZE + for (size_t i = 0; i < (sizeof(a.f32) / sizeof(a.f32[0])); i++) { + r.i8[i] = (int8_t)a.f32[i]; + } +#endif + + return r; +} + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128 simde_mm_cvtpu16_ps(simde__m64 a) 
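+/* A note on the conversion helpers in this file (not from upstream SIMDe):
+ * the native _mm_cvt* intrinsics round according to the current rounding
+ * mode, while the _mm_cvtt* variants always truncate.  The portable
+ * fall-backs here use plain C casts, i.e. truncation, for both, so for
+ * example
+ *
+ *   simde_mm_cvt_ss2si(simde_mm_set_ss(1.7f))
+ *
+ * is 2 with native SSE (round-to-nearest default) but 1 in the pure-C path.
+ * This only describes the snapshot being imported; nothing is changed. */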
+{ + simde__m128 r; + +#if defined(SIMDE_SSE_NATIVE) + r.n = _mm_cvtpu16_ps(a.n); +#else + SIMDE__VECTORIZE + for (size_t i = 0; i < (sizeof(r.f32) / sizeof(r.f32[0])); i++) { + r.f32[i] = (simde_float32)a.u16[i]; + } +#endif + + return r; +} + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128 simde_mm_cvtpu8_ps(simde__m64 a) +{ + simde__m128 r; + +#if defined(SIMDE_SSE_NATIVE) + r.n = _mm_cvtpu8_ps(a.n); +#else + SIMDE__VECTORIZE + for (size_t i = 0; i < 4; i++) { + r.f32[i] = (simde_float32)a.u8[i]; + } +#endif + + return r; +} + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128 simde_mm_cvtsi32_ss(simde__m128 a, int32_t b) +{ + simde__m128 r; + +#if defined(SIMDE_SSE_NATIVE) + r.n = _mm_cvtsi32_ss(a.n, b); +#else + r.f32[0] = (simde_float32)b; + SIMDE__VECTORIZE + for (size_t i = 1; i < 4; i++) { + r.i32[i] = a.i32[i]; + } +#endif + + return r; +} + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128 simde_mm_cvtsi64_ss(simde__m128 a, int64_t b) +{ + simde__m128 r; + +#if defined(SIMDE_SSE_NATIVE) && defined(SIMDE_ARCH_AMD64) +#if !defined(__PGI) + r.n = _mm_cvtsi64_ss(a.n, b); +#else + r.n = _mm_cvtsi64x_ss(a.n, b); +#endif +#else + r.f32[0] = (simde_float32)b; + SIMDE__VECTORIZE + for (size_t i = 1; i < 4; i++) { + r.i32[i] = a.i32[i]; + } +#endif + + return r; +} + +SIMDE__FUNCTION_ATTRIBUTES +simde_float32 simde_mm_cvtss_f32(simde__m128 a) +{ +#if defined(SIMDE_SSE_NATIVE) + return _mm_cvtss_f32(a.n); +#elif defined(SIMDE_SSE_NEON) + return vgetq_lane_f32(a.neon_f32, 0); +#else + return a.f32[0]; +#endif +} + +SIMDE__FUNCTION_ATTRIBUTES +int32_t simde_mm_cvtss_si32(simde__m128 a) +{ +#if defined(SIMDE_SSE_NATIVE) + return _mm_cvtss_si32(a.n); +#else + return (int32_t)a.f32[0]; +#endif +} + +SIMDE__FUNCTION_ATTRIBUTES +int64_t simde_mm_cvtss_si64(simde__m128 a) +{ +#if defined(SIMDE_SSE_NATIVE) && defined(SIMDE_ARCH_AMD64) +#if !defined(__PGI) + return _mm_cvtss_si64(a.n); +#else + return _mm_cvtss_si64x(a.n); +#endif +#else + return (int64_t)a.f32[0]; +#endif +} + +SIMDE__FUNCTION_ATTRIBUTES +simde__m64 simde_mm_cvtt_ps2pi(simde__m128 a) +{ + simde__m64 r; + +#if defined(SIMDE_SSE_NATIVE) + r.n = _mm_cvtt_ps2pi(a.n); +#else + SIMDE__VECTORIZE + for (size_t i = 0; i < (sizeof(r.f32) / sizeof(r.f32[0])); i++) { + r.i32[i] = (int32_t)truncf(a.f32[i]); + } +#endif + + return r; +} + +SIMDE__FUNCTION_ATTRIBUTES +int32_t simde_mm_cvtt_ss2si(simde__m128 a) +{ +#if defined(SIMDE_SSE_NATIVE) + return _mm_cvtt_ss2si(a.n); +#else + return (int32_t)truncf(a.f32[0]); +#endif +} + +SIMDE__FUNCTION_ATTRIBUTES +simde__m64 simde_mm_cvttps_pi32(simde__m128 a) +{ + simde__m64 r; + +#if defined(SIMDE_SSE_NATIVE) + r.n = _mm_cvttps_pi32(a.n); +#else + r = simde_mm_cvtt_ps2pi(a); +#endif + + return r; +} + +SIMDE__FUNCTION_ATTRIBUTES +int32_t simde_mm_cvttss_si32(simde__m128 a) +{ +#if defined(SIMDE_SSE_NATIVE) + return _mm_cvttss_si32(a.n); +#else + return (int32_t)truncf(a.f32[0]); +#endif +} + +SIMDE__FUNCTION_ATTRIBUTES +int64_t simde_mm_cvttss_si64(simde__m128 a) +{ +#if defined(SIMDE_SSE_NATIVE) && defined(SIMDE_ARCH_AMD64) +#if defined(__PGI) + return _mm_cvttss_si64x(a.n); +#else + return _mm_cvttss_si64(a.n); +#endif +#else + return (int64_t)truncf(a.f32[0]); +#endif +} + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128 simde_mm_div_ps(simde__m128 a, simde__m128 b) +{ + simde__m128 r; + +#if defined(SIMDE_SSE_NATIVE) + r.n = _mm_div_ps(a.n, b.n); +#elif defined(SIMDE_SSE_NEON) + float32x4_t recip0 = vrecpeq_f32(b.neon_f32); + float32x4_t recip1 = vmulq_f32(recip0, vrecpsq_f32(recip0, b.neon_f32)); + r.neon_f32 = 
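+		/* a / b is computed as a * (1 / b): vrecpeq_f32() yields a
+		 * low-precision reciprocal estimate and the vrecpsq_f32()
+		 * Newton-Raphson step above refines it, so the result is
+		 * close to, but not bit-exact with, a true division. */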
vmulq_f32(a.neon_f32, recip1); +#else + SIMDE__VECTORIZE + for (size_t i = 0; i < (sizeof(r.f32) / sizeof(r.f32[0])); i++) { + r.f32[i] = a.f32[i] / b.f32[i]; + } +#endif + + return r; +} + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128 simde_mm_div_ss(simde__m128 a, simde__m128 b) +{ + simde__m128 r; + +#if defined(SIMDE_SSE_NATIVE) + r.n = _mm_div_ss(a.n, b.n); +#elif defined(SIMDE_SSE_NEON) + float32_t value = vgetq_lane_f32(simde_mm_div_ps(a, b).neon_f32, 0); + r.neon_f32 = vsetq_lane_f32(value, a.neon_f32, 0); +#else + r.f32[0] = a.f32[0] / b.f32[0]; + SIMDE__VECTORIZE + for (size_t i = 1; i < (sizeof(r.f32) / sizeof(r.f32[0])); i++) { + r.f32[i] = a.f32[i]; + } +#endif + + return r; +} + +SIMDE__FUNCTION_ATTRIBUTES +int32_t simde_mm_extract_pi16(simde__m64 a, const int imm8) +{ + return a.u16[imm8]; +} +#if defined(SIMDE_SSE_NATIVE) +#define simde_mm_extract_pi16(a, imm8) _mm_extract_pi16(a.n, imm8) +#endif +#define simde_m_pextrw(a, imm8) simde_mm_extract_pi16(a.n, imm8) + +enum { +#if defined(SIMDE_SSE_NATIVE) + simde_MM_ROUND_NEAREST = _MM_ROUND_NEAREST, + simde_MM_ROUND_DOWN = _MM_ROUND_DOWN, + simde_MM_ROUND_UP = _MM_ROUND_UP, + simde_MM_ROUND_TOWARD_ZERO = _MM_ROUND_TOWARD_ZERO +#else + simde_MM_ROUND_NEAREST +#if defined(FE_TONEAREST) + = FE_TONEAREST +#endif + , + + simde_MM_ROUND_DOWN +#if defined(FE_DOWNWARD) + = FE_DOWNWARD +#endif + , + + simde_MM_ROUND_UP +#if defined(FE_UPWARD) + = FE_UPWARD +#endif + , + + simde_MM_ROUND_TOWARD_ZERO +#if defined(FE_TOWARDZERO) + = FE_TOWARDZERO +#endif +#endif +}; + +SIMDE__FUNCTION_ATTRIBUTES +unsigned int simde_MM_GET_ROUNDING_MODE(void) +{ +#if defined(SIMDE_SSE_NATIVE) + return _MM_GET_ROUNDING_MODE(); +#else + return fegetround(); +#endif +} + +SIMDE__FUNCTION_ATTRIBUTES +void simde_MM_SET_ROUNDING_MODE(unsigned int a) +{ +#if defined(SIMDE_SSE_NATIVE) + _MM_SET_ROUNDING_MODE(a); +#else + fesetround((int)a); +#endif +} + +SIMDE__FUNCTION_ATTRIBUTES +simde__m64 simde_mm_insert_pi16(simde__m64 a, int16_t i, const int imm8) +{ + simde__m64 r; + r.i64[0] = a.i64[0]; + r.i16[imm8] = i; + return r; +} +#if defined(SIMDE_SSE_NATIVE) && !defined(__PGI) +#define simde_mm_insert_pi16(a, i, imm8) \ + SIMDE__M64_C(_mm_insert_pi16((a).n, i, imm8)); +#endif +#define simde_m_pinsrw(a, i, imm8) \ + SIMDE__M64_C(simde_mm_insert_pi16((a).n, i, imm8)); + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128 +simde_mm_load_ps(simde_float32 const mem_addr[HEDLEY_ARRAY_PARAM(4)]) +{ + simde__m128 r; + + simde_assert_aligned(16, mem_addr); + +#if defined(SIMDE_SSE_NATIVE) + r.n = _mm_load_ps(mem_addr); +#elif defined(SIMDE_SSE_NEON) + r.neon_f32 = vld1q_f32(mem_addr); +#else + memcpy(&r, mem_addr, sizeof(r.f32)); +#endif + + return r; +} + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128 simde_mm_load_ps1(simde_float32 const *mem_addr) +{ + simde__m128 r; + +#if defined(SIMDE_SSE_NATIVE) + r.n = _mm_load_ps1(mem_addr); +#else + const simde_float32 v = *mem_addr; + SIMDE__VECTORIZE + for (size_t i = 0; i < (sizeof(r.i32) / sizeof(r.i32[0])); i++) { + r.f32[i] = v; + } +#endif + + return r; +} + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128 simde_mm_load_ss(simde_float32 const *mem_addr) +{ + simde__m128 r; + +#if defined(SIMDE_SSE_NATIVE) + r.n = _mm_load_ss(mem_addr); +#elif defined(SIMDE_SSE_NEON) + r.neon_f32 = vsetq_lane_f32(*mem_addr, vdupq_n_f32(0), 0); +#else + r.f32[0] = *mem_addr; + r.i32[1] = 0; + r.i32[2] = 0; + r.i32[3] = 0; +#endif + + return r; +} + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128 simde_mm_load1_ps(simde_float32 const *mem_addr) +{ + simde__m128 r; + +#if 
defined(SIMDE_SSE_NATIVE) + r.n = _mm_load1_ps(mem_addr); +#elif defined(SIMDE_SSE_NEON) + r.neon_f32 = vld1q_dup_f32(mem_addr); +#else + r = simde_mm_load_ps1(mem_addr); +#endif + + return r; +} + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128 simde_mm_loadh_pi(simde__m128 a, simde__m64 const *mem_addr) +{ + simde__m128 r; + +#if defined(SIMDE_SSE_NATIVE) + r.n = _mm_loadh_pi(a.n, (__m64 *)mem_addr); +#else + r.f32[0] = a.f32[0]; + r.f32[1] = a.f32[1]; + r.f32[2] = mem_addr->f32[0]; + r.f32[3] = mem_addr->f32[1]; +#endif + + return r; +} + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128 simde_mm_loadl_pi(simde__m128 a, simde__m64 const *mem_addr) +{ + simde__m128 r; + +#if defined(SIMDE_SSE_NATIVE) + r.n = _mm_loadl_pi(a.n, (__m64 *)mem_addr); +#else + r.f32[0] = mem_addr->f32[0]; + r.f32[1] = mem_addr->f32[1]; + r.f32[2] = a.f32[2]; + r.f32[3] = a.f32[3]; +#endif + + return r; +} + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128 +simde_mm_loadr_ps(simde_float32 const mem_addr[HEDLEY_ARRAY_PARAM(4)]) +{ + simde__m128 r; + + simde_assert_aligned(16, mem_addr); + +#if defined(SIMDE_SSE_NATIVE) + r.n = _mm_loadr_ps(mem_addr); +#else + r.f32[0] = mem_addr[3]; + r.f32[1] = mem_addr[2]; + r.f32[2] = mem_addr[1]; + r.f32[3] = mem_addr[0]; +#endif + + return r; +} + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128 +simde_mm_loadu_ps(simde_float32 const mem_addr[HEDLEY_ARRAY_PARAM(4)]) +{ + simde__m128 r; + +#if defined(SIMDE_SSE_NATIVE) + r.n = _mm_loadu_ps(mem_addr); +#elif defined(SIMDE_SSE_NEON) + r.neon_f32 = vld1q_f32(mem_addr); +#else + r.f32[0] = mem_addr[0]; + r.f32[1] = mem_addr[1]; + r.f32[2] = mem_addr[2]; + r.f32[3] = mem_addr[3]; +#endif + + return r; +} + +SIMDE__FUNCTION_ATTRIBUTES +void simde_mm_maskmove_si64(simde__m64 a, simde__m64 mask, char *mem_addr) +{ +#if defined(SIMDE_SSE_NATIVE) + _mm_maskmove_si64(a.n, mask.n, mem_addr); +#else + SIMDE__VECTORIZE + for (size_t i = 0; i < (sizeof(a.i8) / sizeof(a.i8[0])); i++) + if (mask.i8[i] < 0) + mem_addr[i] = a.i8[i]; +#endif +} +#define simde_m_maskmovq(a, mask, mem_addr) \ + simde_mm_maskmove_si64(a, mask, mem_addr) + +SIMDE__FUNCTION_ATTRIBUTES +simde__m64 simde_mm_max_pi16(simde__m64 a, simde__m64 b) +{ + simde__m64 r; + +#if defined(SIMDE_SSE_NATIVE) + r.n = _mm_max_pi16(a.n, b.n); +#else + SIMDE__VECTORIZE + for (size_t i = 0; i < (sizeof(r.i16) / sizeof(r.i16[0])); i++) { + r.i16[i] = (a.i16[i] > b.i16[i]) ? a.i16[i] : b.i16[i]; + } +#endif + + return r; +} +#define simde_m_pmaxsw(a, b) simde_mm_max_pi16(a, b) + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128 simde_mm_max_ps(simde__m128 a, simde__m128 b) +{ + simde__m128 r; + +#if defined(SIMDE_SSE_NATIVE) + r.n = _mm_max_ps(a.n, b.n); +#elif defined(SIMDE_SSE_NEON) + r.neon_f32 = vmaxq_f32(a.neon_f32, b.neon_f32); +#else + SIMDE__VECTORIZE + for (size_t i = 0; i < (sizeof(r.f32) / sizeof(r.f32[0])); i++) { + r.f32[i] = (a.f32[i] > b.f32[i]) ? a.f32[i] : b.f32[i]; + } +#endif + + return r; +} + +SIMDE__FUNCTION_ATTRIBUTES +simde__m64 simde_mm_max_pu8(simde__m64 a, simde__m64 b) +{ + simde__m64 r; + +#if defined(SIMDE_SSE_NATIVE) + r.n = _mm_max_pu8(a.n, b.n); +#else + SIMDE__VECTORIZE + for (size_t i = 0; i < (sizeof(r.u8) / sizeof(r.u8[0])); i++) { + r.u8[i] = (a.u8[i] > b.u8[i]) ? 
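+			/* Asymmetry inherited from MMX/SSE1: 8-bit max/min
+			 * is unsigned (pmaxub/pminub) while 16-bit max/min
+			 * is signed (pmaxsw/pminsw); this ISA level has no
+			 * signed-8 or unsigned-16 variant. */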
a.u8[i] : b.u8[i]; + } +#endif + + return r; +} +#define simde_m_pmaxub(a, b) simde_mm_max_pu8(a, b) + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128 simde_mm_max_ss(simde__m128 a, simde__m128 b) +{ + simde__m128 r; + +#if defined(SIMDE_SSE_NATIVE) + r.n = _mm_max_ss(a.n, b.n); +#elif defined(SIMDE_SSE_NEON) + float32_t value = vgetq_lane_f32(vmaxq_f32(a.neon_f32, b.neon_f32), 0); + r.neon_f32 = vsetq_lane_f32(value, a.neon_f32, 0); +#else + r.f32[0] = (a.f32[0] > b.f32[0]) ? a.f32[0] : b.f32[0]; + r.f32[1] = a.f32[1]; + r.f32[2] = a.f32[2]; + r.f32[3] = a.f32[3]; +#endif + + return r; +} + +SIMDE__FUNCTION_ATTRIBUTES +simde__m64 simde_mm_min_pi16(simde__m64 a, simde__m64 b) +{ + simde__m64 r; + +#if defined(SIMDE_SSE_NATIVE) + r.n = _mm_min_pi16(a.n, b.n); +#else + SIMDE__VECTORIZE + for (size_t i = 0; i < (sizeof(r.i16) / sizeof(r.i16[0])); i++) { + r.i16[i] = (a.i16[i] < b.i16[i]) ? a.i16[i] : b.i16[i]; + } +#endif + + return r; +} +#define simde_m_pminsw(a, b) simde_mm_min_pi16(a, b) + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128 simde_mm_min_ps(simde__m128 a, simde__m128 b) +{ + simde__m128 r; + +#if defined(SIMDE_SSE_NATIVE) + r.n = _mm_min_ps(a.n, b.n); +#elif defined(SIMDE_SSE_NEON) + r.neon_f32 = vminq_f32(a.neon_f32, b.neon_f32); +#else + SIMDE__VECTORIZE + for (size_t i = 0; i < (sizeof(r.f32) / sizeof(r.f32[0])); i++) { + r.f32[i] = (a.f32[i] < b.f32[i]) ? a.f32[i] : b.f32[i]; + } +#endif + + return r; +} + +SIMDE__FUNCTION_ATTRIBUTES +simde__m64 simde_mm_min_pu8(simde__m64 a, simde__m64 b) +{ + simde__m64 r; + +#if defined(SIMDE_SSE_NATIVE) + r.n = _mm_min_pu8(a.n, b.n); +#else + SIMDE__VECTORIZE + for (size_t i = 0; i < (sizeof(r.u8) / sizeof(r.u8[0])); i++) { + r.u8[i] = (a.u8[i] < b.u8[i]) ? a.u8[i] : b.u8[i]; + } +#endif + + return r; +} +#define simde_m_pminub(a, b) simde_mm_min_pu8(a, b) + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128 simde_mm_min_ss(simde__m128 a, simde__m128 b) +{ + simde__m128 r; + +#if defined(SIMDE_SSE_NATIVE) + r.n = _mm_min_ss(a.n, b.n); +#elif defined(SIMDE_SSE_NEON) + float32_t value = vgetq_lane_f32(vminq_f32(a.neon_f32, b.neon_f32), 0); + r.neon_f32 = vsetq_lane_f32(value, a.neon_f32, 0); +#else + r.f32[0] = (a.f32[0] < b.f32[0]) ? 
a.f32[0] : b.f32[0]; + r.f32[1] = a.f32[1]; + r.f32[2] = a.f32[2]; + r.f32[3] = a.f32[3]; +#endif + + return r; +} + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128 simde_mm_move_ss(simde__m128 a, simde__m128 b) +{ + simde__m128 r; + +#if defined(SIMDE_SSE_NATIVE) + r.n = _mm_move_ss(a.n, b.n); +#else + r.f32[0] = b.f32[0]; + r.f32[1] = a.f32[1]; + r.f32[2] = a.f32[2]; + r.f32[3] = a.f32[3]; +#endif + + return r; +} + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128 simde_mm_movehl_ps(simde__m128 a, simde__m128 b) +{ + simde__m128 r; + +#if defined(SIMDE_SSE_NATIVE) + r.n = _mm_movehl_ps(a.n, b.n); +#else + r.f32[0] = b.f32[2]; + r.f32[1] = b.f32[3]; + r.f32[2] = a.f32[2]; + r.f32[3] = a.f32[3]; +#endif + + return r; +} + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128 simde_mm_movelh_ps(simde__m128 a, simde__m128 b) +{ + simde__m128 r; + +#if defined(SIMDE_SSE_NATIVE) + r.n = _mm_movelh_ps(a.n, b.n); +#else + r.f32[0] = a.f32[0]; + r.f32[1] = a.f32[1]; + r.f32[2] = b.f32[0]; + r.f32[3] = b.f32[1]; +#endif + + return r; +} + +SIMDE__FUNCTION_ATTRIBUTES +int simde_mm_movemask_pi8(simde__m64 a) +{ +#if defined(SIMDE_SSE_NATIVE) + return _mm_movemask_pi8(a.n); +#else + int r = 0; + const size_t nmemb = sizeof(a.i8) / sizeof(a.i8[0]); + + SIMDE__VECTORIZE_REDUCTION(| : r) + for (size_t i = 0; i < nmemb; i++) { + r |= (a.u8[nmemb - 1 - i] >> 7) << (nmemb - 1 - i); + } + + return r; +#endif +} +#define simde_m_pmovmskb(a, b) simde_mm_movemask_pi8(a, b) + +SIMDE__FUNCTION_ATTRIBUTES +int simde_mm_movemask_ps(simde__m128 a) +{ +#if defined(SIMDE_SSE_NATIVE) + return _mm_movemask_ps(a.n); +#elif defined(SIMDE_SSE_NEON) + /* TODO: check to see if NEON version is faster than the portable version */ + static const uint32x4_t movemask = {1, 2, 4, 8}; + static const uint32x4_t highbit = {0x80000000, 0x80000000, 0x80000000, + 0x80000000}; + uint32x4_t t0 = a.neon_u32; + uint32x4_t t1 = vtstq_u32(t0, highbit); + uint32x4_t t2 = vandq_u32(t1, movemask); + uint32x2_t t3 = vorr_u32(vget_low_u32(t2), vget_high_u32(t2)); + return vget_lane_u32(t3, 0) | vget_lane_u32(t3, 1); +#else + int r = 0; + + SIMDE__VECTORIZE_REDUCTION(| : r) + for (size_t i = 0; i < sizeof(a.u32) / sizeof(a.u32[0]); i++) { + r |= (a.u32[i] >> ((sizeof(a.u32[i]) * CHAR_BIT) - 1)) << i; + } + + return r; +#endif +} + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128 simde_mm_mul_ps(simde__m128 a, simde__m128 b) +{ + simde__m128 r; + +#if defined(SIMDE_SSE_NATIVE) + r.n = _mm_mul_ps(a.n, b.n); +#elif defined(SIMDE_SSE_NEON) + r.neon_f32 = vmulq_f32(a.neon_f32, b.neon_f32); +#else + SIMDE__VECTORIZE + for (size_t i = 0; i < (sizeof(r.f32) / sizeof(r.f32[0])); i++) { + r.f32[i] = a.f32[i] * b.f32[i]; + } +#endif + + return r; +} + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128 simde_mm_mul_ss(simde__m128 a, simde__m128 b) +{ + simde__m128 r; + +#if defined(SIMDE_SSE_NATIVE) + r.n = _mm_mul_ss(a.n, b.n); +#else + r.f32[0] = a.f32[0] * b.f32[0]; + r.f32[1] = a.f32[1]; + r.f32[2] = a.f32[2]; + r.f32[3] = a.f32[3]; +#endif + + return r; +} + +SIMDE__FUNCTION_ATTRIBUTES +simde__m64 simde_mm_mulhi_pu16(simde__m64 a, simde__m64 b) +{ + simde__m64 r; + +#if defined(SIMDE_SSE_NATIVE) + r.n = _mm_mulhi_pu16(a.n, b.n); +#else + SIMDE__VECTORIZE + for (size_t i = 0; i < (sizeof(r.u16) / sizeof(r.u16[0])); i++) { + r.u16[i] = (a.u16[i] * b.u16[i]) >> 16; + } +#endif + + return r; +} +#define simde_m_pmulhuw(a, b) simde_mm_mulhi_pu16(a, b) + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128 simde_mm_or_ps(simde__m128 a, simde__m128 b) +{ + simde__m128 r; + +#if defined(SIMDE_SSE_NATIVE) + r.n = 
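+	/* The bitwise *_ps operations treat the register purely as 128 bits,
+	 * which is why the portable paths for and/andnot/or/xor work on the
+	 * u32/i32 views of the union rather than on f32. */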
_mm_or_ps(a.n, b.n); +#elif defined(SIMDE_SSE_NEON) + r.neon_i32 = vorrq_s32(a.neon_i32, b.neon_i32); +#else + SIMDE__VECTORIZE + for (size_t i = 0; i < (sizeof(r.u32) / sizeof(r.u32[0])); i++) { + r.u32[i] = a.u32[i] | b.u32[i]; + } +#endif + + return r; +} + +SIMDE__FUNCTION_ATTRIBUTES +void simde_mm_prefetch(char const *p, int i) +{ + (void)p; + (void)i; +} +#if defined(SIMDE_SSE_NATIVE) +#define simde_mm_prefetch(p, i) _mm_prefetch(p, i) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128 simde_mm_rcp_ps(simde__m128 a) +{ + simde__m128 r; + +#if defined(SIMDE_SSE_NATIVE) + r.n = _mm_rcp_ps(a.n); +#elif defined(SIMDE_SSE_NEON) + float32x4_t recip = vrecpeq_f32(a.neon_f32); + +#if !defined(SIMDE_MM_RCP_PS_ITERS) +#define SIMDE_MM_RCP_PS_ITERS SIMDE_ACCURACY_ITERS +#endif + + for (int i = 0; i < SIMDE_MM_RCP_PS_ITERS; ++i) { + recip = vmulq_f32(recip, vrecpsq_f32(recip, a.neon_f32)); + } + + r.neon_f32 = recip; +#else + SIMDE__VECTORIZE + for (size_t i = 0; i < (sizeof(r.f32) / sizeof(r.f32[0])); i++) { + r.f32[i] = 1.0f / a.f32[i]; + } +#endif + + return r; +} + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128 simde_mm_rcp_ss(simde__m128 a) +{ + simde__m128 r; + +#if defined(SIMDE_SSE_NATIVE) + r.n = _mm_rcp_ss(a.n); +#else + r.f32[0] = 1.0f / a.f32[0]; + r.f32[1] = a.f32[1]; + r.f32[2] = a.f32[2]; + r.f32[3] = a.f32[3]; +#endif + + return r; +} + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128 simde_mm_rsqrt_ps(simde__m128 a) +{ + simde__m128 r; + +#if defined(SIMDE_SSE_NATIVE) + r.n = _mm_rsqrt_ps(a.n); +#elif defined(SIMDE_SSE_NEON) + r.neon_f32 = vrsqrteq_f32(a.neon_f32); +#elif defined(__STDC_IEC_559__) + /* http://h14s.p5r.org/2012/09/0x5f3759df.html?mwh=1 */ + SIMDE__VECTORIZE + for (size_t i = 0; i < (sizeof(r.f32) / sizeof(r.f32[0])); i++) { + r.i32[i] = INT32_C(0x5f3759df) - (a.i32[i] >> 1); + +#if SIMDE_ACCURACY_ITERS > 2 + const float half = SIMDE_FLOAT32_C(0.5) * a.f32[i]; + for (int ai = 2; ai < SIMDE_ACCURACY_ITERS; ai++) + r.f32[i] *= SIMDE_FLOAT32_C(1.5) - + (half * r.f32[i] * r.f32[i]); +#endif + } +#else + SIMDE__VECTORIZE + for (size_t i = 0; i < (sizeof(r.f32) / sizeof(r.f32[0])); i++) { + r.f32[i] = 1.0f / sqrtf(a.f32[i]); + } +#endif + + return r; +} + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128 simde_mm_rsqrt_ss(simde__m128 a) +{ + simde__m128 r; + +#if defined(SIMDE_SSE_NATIVE) + r.n = _mm_rsqrt_ss(a.n); +#elif defined(__STDC_IEC_559__) + { + r.i32[0] = INT32_C(0x5f3759df) - (a.i32[0] >> 1); + +#if SIMDE_ACCURACY_ITERS > 2 + float half = SIMDE_FLOAT32_C(0.5) * a.f32[0]; + for (int ai = 2; ai < SIMDE_ACCURACY_ITERS; ai++) + r.f32[0] *= SIMDE_FLOAT32_C(1.5) - + (half * r.f32[0] * r.f32[0]); +#endif + } + r.f32[0] = 1.0f / sqrtf(a.f32[0]); + r.f32[1] = a.f32[1]; + r.f32[2] = a.f32[2]; + r.f32[3] = a.f32[3]; +#else + r.f32[0] = 1.0f / sqrtf(a.f32[0]); + r.f32[1] = a.f32[1]; + r.f32[2] = a.f32[2]; + r.f32[3] = a.f32[3]; +#endif + + return r; +} + +SIMDE__FUNCTION_ATTRIBUTES +simde__m64 simde_mm_sad_pu8(simde__m64 a, simde__m64 b) +{ + simde__m64 r; + +#if defined(SIMDE_SSE_NATIVE) + r.n = _mm_sad_pu8(a.n, b.n); +#else + uint16_t sum = 0; + + SIMDE__VECTORIZE_REDUCTION(+ : sum) + for (size_t i = 0; i < (sizeof(r.u8) / sizeof(r.u8[0])); i++) { + sum += (uint8_t)abs(a.u8[i] - b.u8[i]); + } + + r.i16[0] = sum; + r.i16[1] = 0; + r.i16[2] = 0; + r.i16[3] = 0; +#endif + + return r; +} +#define simde_m_psadbw(a, b) simde_mm_sad_pu8(a, b) + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128 simde_mm_set_ps(simde_float32 e3, simde_float32 e2, + simde_float32 e1, simde_float32 e0) +{ + simde__m128 r; + +#if 
defined(SIMDE_SSE_NATIVE) + r.n = _mm_set_ps(e3, e2, e1, e0); +#elif defined(SIMDE_SSE_NEON) + SIMDE_ALIGN(16) simde_float32 data[4] = {e0, e1, e2, e3}; + r.neon_f32 = vld1q_f32(data); +#else + r.f32[0] = e0; + r.f32[1] = e1; + r.f32[2] = e2; + r.f32[3] = e3; +#endif + + return r; +} + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128 simde_mm_set_ps1(simde_float32 a) +{ + simde__m128 r; + +#if defined(SIMDE_SSE_NATIVE) + r.n = _mm_set1_ps(a); +#elif defined(SIMDE_SSE_NEON) + r.neon_f32 = vdupq_n_f32(a); +#else + r = simde_mm_set_ps(a, a, a, a); +#endif + + return r; +} +#define simde_mm_set1_ps(a) simde_mm_set_ps1(a) + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128 simde_mm_set_ss(simde_float32 a) +{ + simde__m128 r; + +#if defined(SIMDE_SSE_NATIVE) + r.n = _mm_set_ss(a); +#else + r = simde_mm_set_ps(0, 0, 0, a); +#endif + + return r; +} + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128 simde_mm_setr_ps(simde_float32 e3, simde_float32 e2, + simde_float32 e1, simde_float32 e0) +{ + simde__m128 r; + +#if defined(SIMDE_SSE_NATIVE) + r.n = _mm_setr_ps(e3, e2, e1, e0); +#elif defined(SIMDE_SSE_NEON) + SIMDE_ALIGN(16) simde_float32 data[4] = {e3, e2, e1, e0}; + r.neon_f32 = vld1q_f32(data); +#else + r = simde_mm_set_ps(e0, e1, e2, e3); +#endif + + return r; +} + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128 simde_mm_setzero_ps(void) +{ + simde__m128 r; + +#if defined(SIMDE_SSE_NATIVE) + r.n = _mm_setzero_ps(); +#elif defined(SIMDE_SSE_NEON) + r.neon_f32 = vdupq_n_f32(0.0f); +#else + r = simde_mm_set_ps(0.0f, 0.0f, 0.0f, 0.0f); +#endif + + return r; +} + +SIMDE__FUNCTION_ATTRIBUTES +void simde_mm_sfence(void) +{ + /* TODO: Use Hedley. */ +#if defined(SIMDE_SSE_NATIVE) + _mm_sfence(); +#elif defined(__GNUC__) && \ + ((__GNUC__ > 4) || (__GNUC__ == 4 && __GNUC_MINOR__ >= 7)) + __atomic_thread_fence(__ATOMIC_SEQ_CST); +#elif !defined(__INTEL_COMPILER) && defined(__STDC_VERSION__) && \ + (__STDC_VERSION__ >= 201112L) && !defined(__STDC_NO_ATOMICS__) +#if defined(__GNUC__) && (__GNUC__ == 4) && (__GNUC_MINOR__ < 9) + __atomic_thread_fence(__ATOMIC_SEQ_CST); +#else + atomic_thread_fence(memory_order_seq_cst); +#endif +#elif defined(_MSC_VER) + MemoryBarrier(); +#elif defined(__GNUC__) && \ + ((__GNUC__ > 4) || (__GNUC__ == 4 && __GNUC_MINOR__ >= 7)) + __atomic_thread_fence(__ATOMIC_SEQ_CST); +#elif HEDLEY_CLANG_HAS_FEATURE(c_atomic) + __c11_atomic_thread_fence(__ATOMIC_SEQ_CST) +#elif defined(__GNUC__) && \ + ((__GNUC__ > 4) || (__GNUC__ == 4 && __GNUC_MINOR__ >= 1)) + __sync_synchronize(); +#elif (defined(__SUNPRO_C) && (__SUNPRO_C >= 0x5140)) || \ + (defined(__SUNPRO_CC) && (__SUNPRO_CC >= 0x5140)) + __atomic_thread_fence(__ATOMIC_SEQ_CST); +#elif defined(_OPENMP) +#pragma omp critical(simde_mm_sfence_) + { + } +#endif +} + +#define SIMDE_MM_SHUFFLE(z, y, x, w) \ + (((z) << 6) | ((y) << 4) | ((x) << 2) | (w)) + +SIMDE__FUNCTION_ATTRIBUTES +simde__m64 simde_mm_shuffle_pi16(simde__m64 a, const int imm8) +{ + simde__m64 r; + for (size_t i = 0; i < sizeof(r.u16) / sizeof(r.u16[0]); i++) { + r.i16[i] = a.i16[(imm8 >> (i * 2)) & 3]; + } + return r; +} +#if defined(SIMDE_SSE_NATIVE) && !defined(__PGI) +#define simde_mm_shuffle_pi16(a, imm8) SIMDE__M64_C(_mm_shuffle_pi16(a.n, imm8)) +#elif defined(SIMDE__SHUFFLE_VECTOR) +#define simde_mm_shuffle_pi16(a, imm8) \ + ({ \ + const simde__m64 simde__tmp_a_ = a; \ + (simde__m64){.i16 = SIMDE__SHUFFLE_VECTOR( \ + 16, 8, (simde__tmp_a_).i16, \ + (simde__tmp_a_).i16, (((imm8)) & 3), \ + (((imm8) >> 2) & 3), (((imm8) >> 4) & 3), \ + (((imm8) >> 6) & 3))}; \ + }) +#endif + +#if 
defined(SIMDE_SSE_NATIVE) && !defined(__PGI) +#define simde_m_pshufw(a, imm8) SIMDE__M64_C(_m_pshufw(a.n, imm8)) +#else +#define simde_m_pshufw(a, imm8) simde_mm_shuffle_pi16(a, imm8) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128 simde_mm_shuffle_ps(simde__m128 a, simde__m128 b, const int imm8) +{ + simde__m128 r; + r.f32[0] = a.f32[(imm8 >> 0) & 3]; + r.f32[1] = a.f32[(imm8 >> 2) & 3]; + r.f32[2] = b.f32[(imm8 >> 4) & 3]; + r.f32[3] = b.f32[(imm8 >> 6) & 3]; + return r; +} +#if defined(SIMDE_SSE_NATIVE) && !defined(__PGI) +#define simde_mm_shuffle_ps(a, b, imm8) \ + SIMDE__M128_C(_mm_shuffle_ps(a.n, b.n, imm8)) +#elif defined(SIMDE__SHUFFLE_VECTOR) +#define simde_mm_shuffle_ps(a, b, imm8) \ + ({ \ + (simde__m128){.f32 = SIMDE__SHUFFLE_VECTOR( \ + 32, 16, (a).f32, (b).f32, \ + (((imm8)) & 3), (((imm8) >> 2) & 3), \ + (((imm8) >> 4) & 3) + 4, \ + (((imm8) >> 6) & 3) + 4)}; \ + }) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128 simde_mm_sqrt_ps(simde__m128 a) +{ + simde__m128 r; + +#if defined(SIMDE_SSE_NATIVE) + r.n = _mm_sqrt_ps(a.n); +#elif defined(SIMDE_SSE_NEON) + float32x4_t recipsq = vrsqrteq_f32(a.neon_f32); + float32x4_t sq = vrecpeq_f32(recipsq); + /* ??? use step versions of both sqrt and recip for better accuracy? */ + r.neon_f32 = sq; +#else + SIMDE__VECTORIZE + for (size_t i = 0; i < sizeof(r.f32) / sizeof(r.f32[0]); i++) { + r.f32[i] = sqrtf(a.f32[i]); + } +#endif + + return r; +} + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128 simde_mm_sqrt_ss(simde__m128 a) +{ + simde__m128 r; + +#if defined(SIMDE_SSE_NATIVE) + r.n = _mm_sqrt_ss(a.n); +#elif defined(SIMDE_SSE_NEON) + float32_t value = vgetq_lane_f32(simde_mm_sqrt_ps(a).neon_f32, 0); + r.neon_f32 = vsetq_lane_f32(value, a.neon_f32, 0); +#else + r.f32[0] = sqrtf(a.f32[0]); + r.f32[1] = a.f32[1]; + r.f32[2] = a.f32[2]; + r.f32[3] = a.f32[3]; +#endif + + return r; +} + +SIMDE__FUNCTION_ATTRIBUTES +void simde_mm_store_ps(simde_float32 mem_addr[4], simde__m128 a) +{ + simde_assert_aligned(16, mem_addr); + +#if defined(SIMDE_SSE_NATIVE) + _mm_store_ps(mem_addr, a.n); +#elif defined(SIMDE_SSE_NEON) + vst1q_f32(mem_addr, a.neon_f32); +#else + SIMDE__VECTORIZE_ALIGNED(mem_addr : 16) + for (size_t i = 0; i < sizeof(a.f32) / sizeof(a.f32[0]); i++) { + mem_addr[i] = a.f32[i]; + } +#endif +} + +SIMDE__FUNCTION_ATTRIBUTES +void simde_mm_store_ps1(simde_float32 mem_addr[4], simde__m128 a) +{ + simde_assert_aligned(16, mem_addr); + +#if defined(SIMDE_SSE_NATIVE) + _mm_store_ps1(mem_addr, a.n); +#else + SIMDE__VECTORIZE_ALIGNED(mem_addr : 16) + for (size_t i = 0; i < sizeof(a.f32) / sizeof(a.f32[0]); i++) { + mem_addr[i] = a.f32[0]; + } +#endif +} + +SIMDE__FUNCTION_ATTRIBUTES +void simde_mm_store_ss(simde_float32 *mem_addr, simde__m128 a) +{ +#if defined(SIMDE_SSE_NATIVE) + _mm_store_ss(mem_addr, a.n); +#elif defined(SIMDE_SSE_NEON) + vst1q_lane_f32(mem_addr, a.neon_f32, 0); +#else + *mem_addr = a.f32[0]; +#endif +} + +SIMDE__FUNCTION_ATTRIBUTES +void simde_mm_store1_ps(simde_float32 mem_addr[4], simde__m128 a) +{ + simde_assert_aligned(16, mem_addr); + +#if defined(SIMDE_SSE_NATIVE) + _mm_store1_ps(mem_addr, a.n); +#else + simde_mm_store_ps1(mem_addr, a); +#endif +} + +SIMDE__FUNCTION_ATTRIBUTES +void simde_mm_storeh_pi(simde__m64 *mem_addr, simde__m128 a) +{ +#if defined(SIMDE_SSE_NATIVE) + _mm_storeh_pi(&(mem_addr->n), a.n); +#else + mem_addr->f32[0] = a.f32[2]; + mem_addr->f32[1] = a.f32[3]; +#endif +} + +SIMDE__FUNCTION_ATTRIBUTES +void simde_mm_storel_pi(simde__m64 *mem_addr, simde__m128 a) +{ +#if defined(SIMDE_SSE_NATIVE) + 
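+	/* storel/storeh move only half of the vector: simde_mm_storel_pi()
+	 * writes lanes 0-1 and simde_mm_storeh_pi() (above) writes lanes 2-3
+	 * into a 64-bit simde__m64, unlike store_ps/storeu_ps which write
+	 * all four lanes. */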
_mm_storel_pi(&(mem_addr->n), a.n); +#else + mem_addr->f32[0] = a.f32[0]; + mem_addr->f32[1] = a.f32[1]; +#endif +} + +SIMDE__FUNCTION_ATTRIBUTES +void simde_mm_storer_ps(simde_float32 mem_addr[4], simde__m128 a) +{ + simde_assert_aligned(16, mem_addr); + +#if defined(SIMDE_SSE_NATIVE) + _mm_storer_ps(mem_addr, a.n); +#else + SIMDE__VECTORIZE_ALIGNED(mem_addr : 16) + for (size_t i = 0; i < sizeof(a.f32) / sizeof(a.f32[0]); i++) { + mem_addr[i] = + a.f32[((sizeof(a.f32) / sizeof(a.f32[0])) - 1) - i]; + } +#endif +} + +SIMDE__FUNCTION_ATTRIBUTES +void simde_mm_storeu_ps(simde_float32 mem_addr[4], simde__m128 a) +{ +#if defined(SIMDE_SSE_NATIVE) + _mm_storeu_ps(mem_addr, a.n); +#elif defined(SIMDE_SSE_NEON) + vst1q_f32(mem_addr, a.neon_f32); +#else + SIMDE__VECTORIZE + for (size_t i = 0; i < sizeof(a.f32) / sizeof(a.f32[0]); i++) { + mem_addr[i] = a.f32[i]; + } +#endif +} + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128 simde_mm_sub_ps(simde__m128 a, simde__m128 b) +{ + simde__m128 r; + +#if defined(SIMDE_SSE_NATIVE) + r.n = _mm_sub_ps(a.n, b.n); +#elif defined(SIMDE_SSE_NEON) + r.neon_f32 = vsubq_f32(a.neon_f32, b.neon_f32); +#else + SIMDE__VECTORIZE + for (size_t i = 0; i < (sizeof(r.f32) / sizeof(r.f32[0])); i++) { + r.f32[i] = a.f32[i] - b.f32[i]; + } +#endif + + return r; +} + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128 simde_mm_sub_ss(simde__m128 a, simde__m128 b) +{ + simde__m128 r; + +#if defined(SIMDE_SSE_NATIVE) + r.n = _mm_sub_ss(a.n, b.n); +#else + r.f32[0] = a.f32[0] - b.f32[0]; + r.f32[1] = a.f32[1]; + r.f32[2] = a.f32[2]; + r.f32[3] = a.f32[3]; +#endif + + return r; +} + +SIMDE__FUNCTION_ATTRIBUTES +int simde_mm_ucomieq_ss(simde__m128 a, simde__m128 b) +{ +#if defined(SIMDE_SSE_NATIVE) + return _mm_ucomieq_ss(a.n, b.n); +#else + fenv_t envp; + int x = feholdexcept(&envp); + int r = a.f32[0] == b.f32[0]; + if (HEDLEY_LIKELY(x == 0)) + fesetenv(&envp); + return r; +#endif +} + +SIMDE__FUNCTION_ATTRIBUTES +int simde_mm_ucomige_ss(simde__m128 a, simde__m128 b) +{ +#if defined(SIMDE_SSE_NATIVE) + return _mm_ucomige_ss(a.n, b.n); +#else + fenv_t envp; + int x = feholdexcept(&envp); + int r = a.f32[0] >= b.f32[0]; + if (HEDLEY_LIKELY(x == 0)) + fesetenv(&envp); + return r; +#endif +} + +SIMDE__FUNCTION_ATTRIBUTES +int simde_mm_ucomigt_ss(simde__m128 a, simde__m128 b) +{ +#if defined(SIMDE_SSE_NATIVE) + return _mm_ucomigt_ss(a.n, b.n); +#else + fenv_t envp; + int x = feholdexcept(&envp); + int r = a.f32[0] > b.f32[0]; + if (HEDLEY_LIKELY(x == 0)) + fesetenv(&envp); + return r; +#endif +} + +SIMDE__FUNCTION_ATTRIBUTES +int simde_mm_ucomile_ss(simde__m128 a, simde__m128 b) +{ +#if defined(SIMDE_SSE_NATIVE) + return _mm_ucomile_ss(a.n, b.n); +#else + fenv_t envp; + int x = feholdexcept(&envp); + int r = a.f32[0] <= b.f32[0]; + if (HEDLEY_LIKELY(x == 0)) + fesetenv(&envp); + return r; +#endif +} + +SIMDE__FUNCTION_ATTRIBUTES +int simde_mm_ucomilt_ss(simde__m128 a, simde__m128 b) +{ +#if defined(SIMDE_SSE_NATIVE) + return _mm_ucomilt_ss(a.n, b.n); +#else + fenv_t envp; + int x = feholdexcept(&envp); + int r = a.f32[0] < b.f32[0]; + if (HEDLEY_LIKELY(x == 0)) + fesetenv(&envp); + return r; +#endif +} + +SIMDE__FUNCTION_ATTRIBUTES +int simde_mm_ucomineq_ss(simde__m128 a, simde__m128 b) +{ +#if defined(SIMDE_SSE_NATIVE) + return _mm_ucomineq_ss(a.n, b.n); +#else + fenv_t envp; + int x = feholdexcept(&envp); + int r = a.f32[0] != b.f32[0]; + if (HEDLEY_LIKELY(x == 0)) + fesetenv(&envp); + return r; +#endif +} + +#if defined(SIMDE_SSE_NATIVE) +#if defined(__has_builtin) +#if 
__has_builtin(__builtin_ia32_undef128) +#define SIMDE__HAVE_UNDEFINED128 +#endif +#elif !defined(__PGI) && !defined(SIMDE_BUG_GCC_REV_208793) +#define SIMDE__HAVE_UNDEFINED128 +#endif +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128 simde_mm_undefined_ps(void) +{ + simde__m128 r; + +#if defined(SIMDE__HAVE_UNDEFINED128) + r.n = _mm_undefined_ps(); +#else + r = simde_mm_setzero_ps(); +#endif + + return r; +} + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128 simde_mm_unpackhi_ps(simde__m128 a, simde__m128 b) +{ + simde__m128 r; + +#if defined(SIMDE_SSE_NATIVE) + r.n = _mm_unpackhi_ps(a.n, b.n); +#elif defined(SIMDE_SSE_NEON) + float32x2_t a1 = vget_high_f32(a.neon_f32); + float32x2_t b1 = vget_high_f32(b.neon_f32); + float32x2x2_t result = vzip_f32(a1, b1); + r.neon_f32 = vcombine_f32(result.val[0], result.val[1]); +#else + r.f32[0] = a.f32[2]; + r.f32[1] = b.f32[2]; + r.f32[2] = a.f32[3]; + r.f32[3] = b.f32[3]; +#endif + + return r; +} + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128 simde_mm_unpacklo_ps(simde__m128 a, simde__m128 b) +{ + simde__m128 r; + +#if defined(SIMDE_SSE_NATIVE) + r.n = _mm_unpacklo_ps(a.n, b.n); +#elif defined(SIMDE_SSE_NEON) + float32x2_t a1 = vget_low_f32(a.neon_f32); + float32x2_t b1 = vget_low_f32(b.neon_f32); + float32x2x2_t result = vzip_f32(a1, b1); + r.neon_f32 = vcombine_f32(result.val[0], result.val[1]); +#else + r.f32[0] = a.f32[0]; + r.f32[1] = b.f32[0]; + r.f32[2] = a.f32[1]; + r.f32[3] = b.f32[1]; +#endif + + return r; +} + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128 simde_mm_xor_ps(simde__m128 a, simde__m128 b) +{ + simde__m128 r; + +#if defined(SIMDE_SSE_NATIVE) + r.n = _mm_xor_ps(a.n, b.n); +#elif defined(SIMDE_SSE_NEON) + r.neon_i32 = veorq_s32(a.neon_i32, b.neon_i32); +#else + SIMDE__VECTORIZE + for (size_t i = 0; i < (sizeof(r.u32) / sizeof(r.u32[0])); i++) { + r.u32[i] = a.u32[i] ^ b.u32[i]; + } +#endif + + return r; +} + +SIMDE__FUNCTION_ATTRIBUTES +void simde_mm_stream_pi(simde__m64 *mem_addr, simde__m64 a) +{ +#if defined(SIMDE_SSE_NATIVE) + _mm_stream_pi(&(mem_addr->n), a.n); +#else + mem_addr->i64[0] = a.i64[0]; +#endif +} + +SIMDE__FUNCTION_ATTRIBUTES +void simde_mm_stream_ps(simde_float32 mem_addr[4], simde__m128 a) +{ + simde_assert_aligned(16, mem_addr); + +#if defined(SIMDE_SSE_NATIVE) + _mm_stream_ps(mem_addr, a.n); +#else + SIMDE__ASSUME_ALIGNED(mem_addr, 16); + memcpy(mem_addr, &a, sizeof(a)); +#endif +} + +SIMDE__FUNCTION_ATTRIBUTES +uint32_t simde_mm_getcsr(void) +{ +#if defined(SIMDE_SSE_NATIVE) + return _mm_getcsr(); +#else + uint32_t r = 0; + int rounding_mode = fegetround(); + + switch (rounding_mode) { + case FE_TONEAREST: + break; + case FE_UPWARD: + r |= 2 << 13; + break; + case FE_DOWNWARD: + r |= 1 << 13; + break; + case FE_TOWARDZERO: + r = 3 << 13; + break; + } + + return r; +#endif +} + +SIMDE__FUNCTION_ATTRIBUTES +void simde_mm_setcsr(uint32_t a) +{ +#if defined(SIMDE_SSE_NATIVE) + _mm_setcsr(a); +#else + switch ((a >> 13) & 3) { + case 0: + fesetround(FE_TONEAREST); + break; + case 1: + fesetround(FE_DOWNWARD); + break; + case 2: + fesetround(FE_UPWARD); + break; + case 3: + fesetround(FE_TOWARDZERO); + break; + } +#endif +} + +#define SIMDE_MM_TRANSPOSE4_PS(row0, row1, row2, row3) \ + do { \ + simde__m128 tmp3, tmp2, tmp1, tmp0; \ + tmp0 = simde_mm_unpacklo_ps((row0), (row1)); \ + tmp2 = simde_mm_unpacklo_ps((row2), (row3)); \ + tmp1 = simde_mm_unpackhi_ps((row0), (row1)); \ + tmp3 = simde_mm_unpackhi_ps((row2), (row3)); \ + row0 = simde_mm_movelh_ps(tmp0, tmp2); \ + row1 = simde_mm_movehl_ps(tmp2, tmp0); \ + row2 = 
simde_mm_movelh_ps(tmp1, tmp3); \ + row3 = simde_mm_movehl_ps(tmp3, tmp1); \ + } while (0) + +SIMDE__END_DECLS + +#endif /* !defined(SIMDE__SSE_H) */ diff --git a/libobs/util/aarch/sse2.h b/libobs/util/aarch/sse2.h new file mode 100644 index 000000000..caad0a4ed --- /dev/null +++ b/libobs/util/aarch/sse2.h @@ -0,0 +1,4197 @@ +/* Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, + * modify, merge, publish, distribute, sublicense, and/or sell copies + * of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + * Copyright: + * 2017 Evan Nemerson + * 2015-2017 John W. Ratcliff + * 2015 Brandon Rowlett + * 2015 Ken Fast + * 2017 Hasindu Gamaarachchi + * 2018 Jeff Daily + */ + +#if !defined(SIMDE__SSE2_H) +#if !defined(SIMDE__SSE2_H) +#define SIMDE__SSE2_H +#endif +#include "sse.h" + +#if defined(SIMDE_SSE2_NATIVE) +#undef SIMDE_SSE2_NATIVE +#endif +#if defined(SIMDE_SSE2_FORCE_NATIVE) +#define SIMDE_SSE2_NATIVE +#elif defined(__SSE2__) && !defined(SIMDE_SSE2_NO_NATIVE) && \ + !defined(SIMDE_NO_NATIVE) +#define SIMDE_SSE2_NATIVE +#elif defined(__ARM_NEON) && !defined(SIMDE_SSE2_NO_NEON) && \ + !defined(SIMDE_NO_NEON) +#define SIMDE_SSE2_NEON +#endif + +#if defined(SIMDE_SSE2_NATIVE) && !defined(SIMDE_SSE_NATIVE) +#if defined(SIMDE_SSE2_FORCE_NATIVE) +#error Native SSE2 support requires native SSE support +#else +#warning Native SSE2 support requires native SSE support, disabling +#undef SIMDE_SSE2_NATIVE +#endif +#elif defined(SIMDE_SSE2_NEON) && !defined(SIMDE_SSE_NEON) +#warning SSE2 NEON support requires SSE NEON support, disabling +#undef SIMDE_SSE_NEON +#endif + +#if defined(SIMDE_SSE2_NATIVE) +#include <emmintrin.h> +#else +#if defined(SIMDE_SSE2_NEON) +#include <arm_neon.h> +#endif +#endif + +#include <stdint.h> +#include <limits.h> +#include <string.h> + +#define vreinterpretq_m128i_s32(v) \ + (simde__m128i) { .neon_i32 = v } +#define vreinterpretq_m128i_u64(v) \ + (simde__m128i) { .neon_u64 = v } + +#define vreinterpretq_s32_m128i(a) a.neon_i32 +#define vreinterpretq_u64_m128i(a) a.neon_u64 + +SIMDE__BEGIN_DECLS + +typedef SIMDE_ALIGN(16) union { +#if defined(SIMDE__ENABLE_GCC_VEC_EXT) + int8_t i8 __attribute__((__vector_size__(16), __may_alias__)); + int16_t i16 __attribute__((__vector_size__(16), __may_alias__)); + int32_t i32 __attribute__((__vector_size__(16), __may_alias__)); + int64_t i64 __attribute__((__vector_size__(16), __may_alias__)); + uint8_t u8 __attribute__((__vector_size__(16), __may_alias__)); + uint16_t u16 __attribute__((__vector_size__(16), __may_alias__)); + uint32_t u32 __attribute__((__vector_size__(16), __may_alias__)); + uint64_t u64 __attribute__((__vector_size__(16), __may_alias__)); +#if defined(SIMDE__HAVE_INT128) + 
simde_int128 i128 __attribute__((__vector_size__(16), __may_alias__)); + simde_uint128 u128 __attribute__((__vector_size__(16), __may_alias__)); +#endif + simde_float32 f32 __attribute__((__vector_size__(16), __may_alias__)); + simde_float64 f64 __attribute__((__vector_size__(16), __may_alias__)); +#else + int8_t i8[16]; + int16_t i16[8]; + int32_t i32[4]; + int64_t i64[2]; + uint8_t u8[16]; + uint16_t u16[8]; + uint32_t u32[4]; + uint64_t u64[2]; +#if defined(SIMDE__HAVE_INT128) + simde_int128 i128[1]; + simde_uint128 u128[1]; +#endif + simde_float32 f32[4]; + simde_float64 f64[2]; +#endif + +#if defined(SIMDE_SSE2_NATIVE) + __m128i n; +#elif defined(SIMDE_SSE2_NEON) + int8x16_t neon_i8; + int16x8_t neon_i16; + int32x4_t neon_i32; + int64x2_t neon_i64; + uint8x16_t neon_u8; + uint16x8_t neon_u16; + uint32x4_t neon_u32; + uint64x2_t neon_u64; + float32x4_t neon_f32; +#if defined(SIMDE_ARCH_AMD64) + float64x2_t neon_f64; +#endif +#endif +} simde__m128i; + +typedef SIMDE_ALIGN(16) union { +#if defined(SIMDE__ENABLE_GCC_VEC_EXT) + int8_t i8 __attribute__((__vector_size__(16), __may_alias__)); + int16_t i16 __attribute__((__vector_size__(16), __may_alias__)); + int32_t i32 __attribute__((__vector_size__(16), __may_alias__)); + int64_t i64 __attribute__((__vector_size__(16), __may_alias__)); + uint8_t u8 __attribute__((__vector_size__(16), __may_alias__)); + uint16_t u16 __attribute__((__vector_size__(16), __may_alias__)); + uint32_t u32 __attribute__((__vector_size__(16), __may_alias__)); + uint64_t u64 __attribute__((__vector_size__(16), __may_alias__)); + simde_float32 f32 __attribute__((__vector_size__(16), __may_alias__)); + simde_float64 f64 __attribute__((__vector_size__(16), __may_alias__)); +#else + int8_t i8[16]; + int16_t i16[8]; + int32_t i32[4]; + int64_t i64[2]; + uint8_t u8[16]; + uint16_t u16[8]; + uint32_t u32[4]; + uint64_t u64[2]; + simde_float32 f32[4]; + simde_float64 f64[2]; +#endif + +#if defined(SIMDE_SSE2_NATIVE) + __m128d n; +#elif defined(SIMDE_SSE2_NEON) + int8x16_t neon_i8; + int16x8_t neon_i16; + int32x4_t neon_i32; + int64x2_t neon_i64; + uint8x16_t neon_u8; + uint16x8_t neon_u16; + uint32x4_t neon_u32; + uint64x2_t neon_u64; + float32x4_t neon_f32; +#if defined(SIMDE_ARCH_AMD64) + float64x2_t neon_f64; +#endif +#endif +} simde__m128d; + +#if defined(SIMDE_SSE2_NATIVE) +HEDLEY_STATIC_ASSERT(sizeof(__m128i) == sizeof(simde__m128i), + "__m128i size doesn't match simde__m128i size"); +HEDLEY_STATIC_ASSERT(sizeof(__m128d) == sizeof(simde__m128d), + "__m128d size doesn't match simde__m128d size"); +SIMDE__FUNCTION_ATTRIBUTES simde__m128i SIMDE__M128I_C(__m128i v) +{ + simde__m128i r; + r.n = v; + return r; +} +SIMDE__FUNCTION_ATTRIBUTES simde__m128d SIMDE__M128D_C(__m128d v) +{ + simde__m128d r; + r.n = v; + return r; +} +#elif defined(SIMDE_SSE_NEON) +#define SIMDE__M128I_NEON_C(T, expr) \ + (simde__m128i) { .neon_##T = expr } +#define SIMDE__M128D_NEON_C(T, expr) \ + (simde__m128d) { .neon_##T = expr } +#endif +HEDLEY_STATIC_ASSERT(16 == sizeof(simde__m128i), "simde__m128i size incorrect"); +HEDLEY_STATIC_ASSERT(16 == sizeof(simde__m128d), "simde__m128d size incorrect"); + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128i simde_mm_add_epi8(simde__m128i a, simde__m128i b) +{ +#if defined(SIMDE_SSE2_NATIVE) + return SIMDE__M128I_C(_mm_add_epi8(a.n, b.n)); +#elif defined(SIMDE_SSE2_NEON) + return SIMDE__M128I_NEON_C(i8, vaddq_s8(a.neon_i8, b.neon_i8)); +#else + simde__m128i r; + SIMDE__VECTORIZE + for (size_t i = 0; i < (sizeof(r.i8) / sizeof(r.i8[0])); i++) { + r.i8[i] = 
a.i8[i] + b.i8[i]; + } + return r; +#endif +} + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128i simde_mm_add_epi16(simde__m128i a, simde__m128i b) +{ +#if defined(SIMDE_SSE2_NATIVE) + return SIMDE__M128I_C(_mm_add_epi16(a.n, b.n)); +#elif defined(SIMDE_SSE2_NEON) + return SIMDE__M128I_NEON_C(i16, vaddq_s16(a.neon_i16, b.neon_i16)); +#else + simde__m128i r; + SIMDE__VECTORIZE + for (size_t i = 0; i < (sizeof(r.i16) / sizeof(r.i16[0])); i++) { + r.i16[i] = a.i16[i] + b.i16[i]; + } + return r; +#endif +} + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128i simde_mm_add_epi32(simde__m128i a, simde__m128i b) +{ +#if defined(SIMDE_SSE2_NATIVE) + return SIMDE__M128I_C(_mm_add_epi32(a.n, b.n)); +#elif defined(SIMDE_SSE2_NEON) + return SIMDE__M128I_NEON_C(i32, vaddq_s32(a.neon_i32, b.neon_i32)); +#else + simde__m128i r; + SIMDE__VECTORIZE + for (size_t i = 0; i < (sizeof(r.i32) / sizeof(r.i32[0])); i++) { + r.i32[i] = a.i32[i] + b.i32[i]; + } + return r; +#endif +} + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128i simde_mm_add_epi64(simde__m128i a, simde__m128i b) +{ +#if defined(SIMDE_SSE2_NATIVE) + return SIMDE__M128I_C(_mm_add_epi64(a.n, b.n)); +#elif defined(SIMDE_SSE2_NEON) + return SIMDE__M128I_NEON_C(i64, vaddq_s64(a.neon_i64, b.neon_i64)); +#else + simde__m128i r; + SIMDE__VECTORIZE + for (size_t i = 0; i < (sizeof(r.i64) / sizeof(r.i64[0])); i++) { + r.i64[i] = a.i64[i] + b.i64[i]; + } + return r; +#endif +} + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128d simde_mm_add_pd(simde__m128d a, simde__m128d b) +{ +#if defined(SIMDE_SSE2_NATIVE) + return SIMDE__M128D_C(_mm_add_pd(a.n, b.n)); +#elif defined(SIMDE_SSE2_NEON) && defined(SIMDE_ARCH_AMD64) + return SIMDE__M128I_NEON_C(f64, vaddq_f64(a.neon_f64, b.neon_f64)); +#else + simde__m128d r; + SIMDE__VECTORIZE + for (size_t i = 0; i < (sizeof(r.f64) / sizeof(r.f64[0])); i++) { + r.f64[i] = a.f64[i] + b.f64[i]; + } + return r; +#endif +} + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128d simde_mm_add_sd(simde__m128d a, simde__m128d b) +{ +#if defined(SIMDE_SSE2_NATIVE) + return SIMDE__M128D_C(_mm_add_sd(a.n, b.n)); +#else + simde__m128d r; + r.f64[0] = a.f64[0] + b.f64[0]; + r.f64[1] = a.f64[1]; + return r; +#endif +} + +SIMDE__FUNCTION_ATTRIBUTES +simde__m64 simde_mm_add_si64(simde__m64 a, simde__m64 b) +{ +#if defined(SIMDE_SSE2_NATIVE) + return SIMDE__M64_C(_mm_add_si64(a.n, b.n)); +#elif defined(SIMDE_SSE2_NEON) + return SIMDE__M64_NEON_C(i64, vadd_s64(a.neon_i64, b.neon_i64)); +#else + simde__m64 r; + r.i64[0] = a.i64[0] + b.i64[0]; + return r; +#endif +} + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128i simde_mm_adds_epi8(simde__m128i a, simde__m128i b) +{ +#if defined(SIMDE_SSE2_NATIVE) + return SIMDE__M128I_C(_mm_adds_epi8(a.n, b.n)); +#elif defined(SIMDE_SSE2_NEON) + return SIMDE__M128I_NEON_C(i8, vqaddq_s8(a.neon_i8, b.neon_i8)); +#else + simde__m128i r; + SIMDE__VECTORIZE + for (size_t i = 0; i < (sizeof(r.i8) / sizeof(r.i8[0])); i++) { + if ((((b.i8[i]) > 0) && ((a.i8[i]) > (INT8_MAX - (b.i8[i]))))) { + r.i8[i] = INT8_MAX; + } else if ((((b.i8[i]) < 0) && + ((a.i8[i]) < (INT8_MIN - (b.i8[i]))))) { + r.i8[i] = INT8_MIN; + } else { + r.i8[i] = (a.i8[i]) + (b.i8[i]); + } + } + return r; +#endif +} + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128i simde_mm_adds_epi16(simde__m128i a, simde__m128i b) +{ +#if defined(SIMDE_SSE2_NATIVE) + return SIMDE__M128I_C(_mm_adds_epi16(a.n, b.n)); +#elif defined(SIMDE_SSE2_NEON) + return SIMDE__M128I_NEON_C(i16, vqaddq_s16(a.neon_i16, b.neon_i16)); +#else + simde__m128i r; + SIMDE__VECTORIZE + for (size_t i = 0; i < (sizeof(r.i16) / 
sizeof(r.i16[0])); i++) { + if ((((b.i16[i]) > 0) && + ((a.i16[i]) > (INT16_MAX - (b.i16[i]))))) { + r.i16[i] = INT16_MAX; + } else if ((((b.i16[i]) < 0) && + ((a.i16[i]) < (INT16_MIN - (b.i16[i]))))) { + r.i16[i] = INT16_MIN; + } else { + r.i16[i] = (a.i16[i]) + (b.i16[i]); + } + } + return r; +#endif +} + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128i simde_mm_adds_epu8(simde__m128i a, simde__m128i b) +{ +#if defined(SIMDE_SSE2_NATIVE) + return SIMDE__M128I_C(_mm_adds_epu8(a.n, b.n)); +#elif defined(SIMDE_SSE2_NEON) + return SIMDE__M128I_NEON_C(u8, vqaddq_u8(a.neon_u8, b.neon_u8)); +#else + simde__m128i r; + SIMDE__VECTORIZE + for (size_t i = 0; i < (sizeof(r.u8) / sizeof(r.u8[0])); i++) { + r.u8[i] = ((UINT8_MAX - a.u8[i]) > b.u8[i]) + ? (a.u8[i] + b.u8[i]) + : UINT8_MAX; + } + return r; +#endif +} + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128i simde_mm_adds_epu16(simde__m128i a, simde__m128i b) +{ +#if defined(SIMDE_SSE2_NATIVE) + return SIMDE__M128I_C(_mm_adds_epu16(a.n, b.n)); +#elif defined(SIMDE_SSE2_NEON) + return SIMDE__M128I_NEON_C(u16, vqaddq_u16(a.neon_u16, b.neon_u16)); +#else + simde__m128i r; + SIMDE__VECTORIZE + for (size_t i = 0; i < (sizeof(r.u16) / sizeof(r.u16[0])); i++) { + r.u16[i] = ((UINT16_MAX - a.u16[i]) > b.u16[i]) + ? (a.u16[i] + b.u16[i]) + : UINT16_MAX; + } + return r; +#endif +} + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128d simde_mm_and_pd(simde__m128d a, simde__m128d b) +{ +#if defined(SIMDE_SSE2_NATIVE) + return SIMDE__M128D_C(_mm_and_pd(a.n, b.n)); +#elif defined(SIMDE_SSE2_NEON) + return SIMDE__M128D_NEON_C(i32, vandq_s32(a.neon_i32, b.neon_i32)); +#else + simde__m128d r; + SIMDE__VECTORIZE + for (size_t i = 0; i < (sizeof(r.u64) / sizeof(r.u64[0])); i++) { + r.u64[i] = a.u64[i] & b.u64[i]; + } + return r; +#endif +} + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128i simde_mm_and_si128(simde__m128i a, simde__m128i b) +{ +#if defined(SIMDE_SSE2_NATIVE) + return SIMDE__M128I_C(_mm_and_si128(a.n, b.n)); +#elif defined(SIMDE_SSE_NEON) + return SIMDE__M128I_NEON_C(i32, vandq_s32(b.neon_i32, a.neon_i32)); +#else + simde__m128i r; + SIMDE__VECTORIZE + for (size_t i = 0; i < (sizeof(r.i64) / sizeof(r.i64[0])); i++) { + r.i64[i] = a.i64[i] & b.i64[i]; + } + return r; +#endif +} + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128d simde_mm_andnot_pd(simde__m128d a, simde__m128d b) +{ +#if defined(SIMDE_SSE2_NATIVE) + return SIMDE__M128D_C(_mm_andnot_pd(a.n, b.n)); +#elif defined(SIMDE_SSE2_NEON) + return SIMDE__M128D_NEON_C(i32, vbicq_s32(a.neon_i32, b.neon_i32)); +#else + simde__m128d r; + SIMDE__VECTORIZE + for (size_t i = 0; i < (sizeof(r.u64) / sizeof(r.u64[0])); i++) { + r.u64[i] = ~a.u64[i] & b.u64[i]; + } + return r; +#endif +} + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128i simde_mm_andnot_si128(simde__m128i a, simde__m128i b) +{ +#if defined(SIMDE_SSE2_NATIVE) + return SIMDE__M128I_C(_mm_andnot_si128(a.n, b.n)); +#elif defined(SIMDE_SSE2_NEON) + return SIMDE__M128I_NEON_C(i32, vbicq_s32(b.neon_i32, a.neon_i32)); +#else + simde__m128i r; + SIMDE__VECTORIZE + for (size_t i = 0; i < (sizeof(r.i64) / sizeof(r.i64[0])); i++) { + r.i64[i] = ~(a.i64[i]) & b.i64[i]; + } + return r; +#endif +} + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128i simde_mm_avg_epu8(simde__m128i a, simde__m128i b) +{ +#if defined(SIMDE_SSE2_NATIVE) + return SIMDE__M128I_C(_mm_avg_epu8(a.n, b.n)); +#elif defined(SIMDE_SSE2_NEON) + return SIMDE__M128I_NEON_C(u8, vrhaddq_u8(b.neon_u8, a.neon_u8)); +#else + simde__m128i r; + SIMDE__VECTORIZE + for (size_t i = 0; i < (sizeof(r.u8) / sizeof(r.u8[0])); i++) { + r.u8[i] = 
(a.u8[i] + b.u8[i] + 1) >> 1; + } + return r; +#endif +} + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128i simde_mm_avg_epu16(simde__m128i a, simde__m128i b) +{ +#if defined(SIMDE_SSE2_NATIVE) + return SIMDE__M128I_C(_mm_avg_epu16(a.n, b.n)); +#elif defined(SIMDE_SSE2_NEON) + return SIMDE__M128I_NEON_C(u16, vrhaddq_u16(b.neon_u16, a.neon_u16)); +#else + simde__m128i r; + SIMDE__VECTORIZE + for (size_t i = 0; i < (sizeof(r.u16) / sizeof(r.u16[0])); i++) { + r.u16[i] = (a.u16[i] + b.u16[i] + 1) >> 1; + } + return r; +#endif +} + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128i simde_mm_bslli_si128(simde__m128i a, const int imm8) +{ + simde__m128i r; + + if (HEDLEY_UNLIKELY(imm8 > 15)) { + r.u64[0] = 0; + r.u64[1] = 0; + return r; + } + + const int s = imm8 * 8; + +#if defined(SIMDE__HAVE_INT128) + r.u128[0] = a.u128[0] << s; +#else + if (s < 64) { + r.u64[0] = (a.u64[0] << s); + r.u64[1] = (a.u64[1] << s) | (a.u64[0] >> (64 - s)); + } else { + r.u64[0] = 0; + r.u64[1] = a.u64[0] << (s - 64); + } +#endif + + return r; +} +#if defined(SIMDE_SSE2_NATIVE) && !defined(__PGI) +#define simde_mm_bslli_si128(a, imm8) SIMDE__M128I_C(_mm_slli_si128(a.n, imm8)) +#elif defined(SIMDE_SSE2_NEON) +#define simde_mm_bslli_si128(a, imm8) \ + SIMDE__M128I_NEON_C( \ + i8, \ + (((imm8) <= 0) ? ((a).neon_i8) \ + : (((imm8) > 15) ? (vdupq_n_s8(0)) \ + : (vextq_s8(vdupq_n_s8(0), \ + (a).neon_i8, \ + 16 - (imm8)))))) +#endif +#define simde_mm_slli_si128(a, imm8) simde_mm_bslli_si128(a, imm8) + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128i simde_mm_bsrli_si128(simde__m128i a, const int imm8) +{ + simde__m128i r; + + if (HEDLEY_UNLIKELY(imm8 > 15)) { + r.u64[0] = 0; + r.u64[1] = 0; + return r; + } + + const int s = imm8 * 8; + +#if defined(SIMDE__HAVE_INT128) + r.u128[0] = a.u128[0] >> s; +#else + if (s < 64) { + r.u64[0] = (a.u64[0] >> s) | (a.u64[1] << (64 - s)); + r.u64[1] = (a.u64[1] >> s); + } else { + r.u64[0] = a.u64[1] >> (s - 64); + r.u64[1] = 0; + } +#endif + + return r; +} +#if defined(SIMDE_SSE2_NATIVE) && !defined(__PGI) +#define simde_mm_bsrli_si128(a, imm8) SIMDE__M128I_C(_mm_srli_si128(a.n, imm8)) +#elif defined(SIMDE_SSE2_NEON) +#define simde_mm_bsrli_si128(a, imm8) \ + SIMDE__M128I_NEON_C( \ + i8, \ + ((imm8) <= 0) \ + ? ((a).neon_i8) \ + : (((imm8) > 15) ? 
(vdupq_n_s8(0)) \ + : (vextq_s8((a).neon_i8, \ + vdupq_n_s8(0), (imm8))))) +#endif +#define simde_mm_srli_si128(a, imm8) simde_mm_bsrli_si128(a, imm8) + +SIMDE__FUNCTION_ATTRIBUTES +void simde_mm_clflush(void const *p) +{ +#if defined(SIMDE_SSE2_NATIVE) + _mm_clflush(p); +#else + (void)p; +#endif +} + +SIMDE__FUNCTION_ATTRIBUTES +int simde_mm_comieq_sd(simde__m128d a, simde__m128d b) +{ +#if defined(SIMDE_SSE2_NATIVE) + return _mm_comieq_sd(a.n, b.n); +#else + return a.f64[0] == b.f64[0]; +#endif +} + +SIMDE__FUNCTION_ATTRIBUTES +int simde_mm_comige_sd(simde__m128d a, simde__m128d b) +{ +#if defined(SIMDE_SSE2_NATIVE) + return _mm_comige_sd(a.n, b.n); +#else + return a.f64[0] >= b.f64[0]; +#endif +} + +SIMDE__FUNCTION_ATTRIBUTES +int simde_mm_comigt_sd(simde__m128d a, simde__m128d b) +{ +#if defined(SIMDE_SSE2_NATIVE) + return _mm_comigt_sd(a.n, b.n); +#else + return a.f64[0] > b.f64[0]; +#endif +} + +SIMDE__FUNCTION_ATTRIBUTES +int simde_mm_comile_sd(simde__m128d a, simde__m128d b) +{ +#if defined(SIMDE_SSE2_NATIVE) + return _mm_comile_sd(a.n, b.n); +#else + return a.f64[0] <= b.f64[0]; +#endif +} + +SIMDE__FUNCTION_ATTRIBUTES +int simde_mm_comilt_sd(simde__m128d a, simde__m128d b) +{ +#if defined(SIMDE_SSE2_NATIVE) + return _mm_comilt_sd(a.n, b.n); +#else + return a.f64[0] < b.f64[0]; +#endif +} + +SIMDE__FUNCTION_ATTRIBUTES +int simde_mm_comineq_sd(simde__m128d a, simde__m128d b) +{ +#if defined(SIMDE_SSE2_NATIVE) + return _mm_comineq_sd(a.n, b.n); +#else + return a.f64[0] != b.f64[0]; +#endif +} + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128 simde_mm_castpd_ps(simde__m128d a) +{ +#if defined(SIMDE_SSE2_NATIVE) + return SIMDE__M128_C(_mm_castpd_ps(a.n)); +#else + union { + simde__m128d pd; + simde__m128 ps; + } r; + r.pd = a; + return r.ps; +#endif +} + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128i simde_mm_castpd_si128(simde__m128d a) +{ +#if defined(SIMDE_SSE2_NATIVE) + return SIMDE__M128I_C(_mm_castpd_si128(a.n)); +#else + union { + simde__m128d pd; + simde__m128i si128; + } r; + r.pd = a; + return r.si128; +#endif +} + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128d simde_mm_castps_pd(simde__m128 a) +{ +#if defined(SIMDE_SSE2_NATIVE) + return SIMDE__M128D_C(_mm_castps_pd(a.n)); +#else + union { + simde__m128 ps; + simde__m128d pd; + } r; + r.ps = a; + return r.pd; +#endif +} + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128i simde_mm_castps_si128(simde__m128 a) +{ +#if defined(SIMDE_SSE2_NATIVE) + return SIMDE__M128I_C(_mm_castps_si128(a.n)); +#elif defined(SIMDE_SSE2_NEON) + return SIMDE__M128I_NEON_C(i32, a.neon_i32); +#else + union { + simde__m128 ps; + simde__m128i si128; + } r; + r.ps = a; + return r.si128; +#endif +} + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128d simde_mm_castsi128_pd(simde__m128i a) +{ +#if defined(SIMDE_SSE2_NATIVE) + return SIMDE__M128D_C(_mm_castsi128_pd(a.n)); +#else + union { + simde__m128i si128; + simde__m128d pd; + } r; + r.si128 = a; + return r.pd; +#endif +} + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128 simde_mm_castsi128_ps(simde__m128i a) +{ +#if defined(SIMDE_SSE2_NATIVE) + return SIMDE__M128_C(_mm_castsi128_ps(a.n)); +#elif defined(SIMDE_SSE2_NEON) + return SIMDE__M128_NEON_C(f32, a.neon_f32); +#else + union { + simde__m128i si128; + simde__m128 ps; + } r; + r.si128 = a; + return r.ps; +#endif +} + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128i simde_mm_cmpeq_epi8(simde__m128i a, simde__m128i b) +{ +#if defined(SIMDE_SSE2_NATIVE) + return SIMDE__M128I_C(_mm_cmpeq_epi8(a.n, b.n)); +#elif defined(SIMDE_SSE2_NEON) + return SIMDE__M128I_NEON_C( + i8, 
vreinterpretq_s8_u8(vceqq_s8(b.neon_i8, a.neon_i8))); +#else + simde__m128i r; + SIMDE__VECTORIZE + for (size_t i = 0; i < (sizeof(r.i8) / sizeof(r.i8[0])); i++) { + r.i8[i] = (a.i8[i] == b.i8[i]) ? 0xff : 0x00; + } + return r; +#endif +} + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128i simde_mm_cmpeq_epi16(simde__m128i a, simde__m128i b) +{ +#if defined(SIMDE_SSE2_NATIVE) + return SIMDE__M128I_C(_mm_cmpeq_epi16(a.n, b.n)); +#elif defined(SIMDE_SSE2_NEON) + return SIMDE__M128I_NEON_C( + i16, vreinterpretq_s16_u16(vceqq_s16(b.neon_i16, a.neon_i16))); +#else + simde__m128i r; + SIMDE__VECTORIZE + for (size_t i = 0; i < (sizeof(r.i16) / sizeof(r.i16[0])); i++) { + r.i16[i] = (a.i16[i] == b.i16[i]) ? 0xffff : 0x0000; + } + return r; +#endif +} + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128i simde_mm_cmpeq_epi32(simde__m128i a, simde__m128i b) +{ +#if defined(SIMDE_SSE2_NATIVE) + return SIMDE__M128I_C(_mm_cmpeq_epi32(a.n, b.n)); +#elif defined(SIMDE_SSE2_NEON) + return SIMDE__M128I_NEON_C( + i32, vreinterpretq_s32_u32(vceqq_s32(b.neon_i32, a.neon_i32))); +#else + simde__m128i r; + SIMDE__VECTORIZE + for (size_t i = 0; i < (sizeof(r.i32) / sizeof(r.i32[0])); i++) { + r.i32[i] = (a.i32[i] == b.i32[i]) ? 0xffffffff : 0x00000000; + } + return r; +#endif +} + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128d simde_mm_cmpeq_pd(simde__m128d a, simde__m128d b) +{ +#if defined(SIMDE_SSE2_NATIVE) + return SIMDE__M128D_C(_mm_cmpeq_pd(a.n, b.n)); +#elif defined(SIMDE_SSE2_NEON) + return SIMDE__M128D_NEON_C( + i32, vreinterpretq_s32_u32( + vceqq_s32(vreinterpretq_s32_f32(b.neon_f32), + vreinterpretq_s32_f32(a.neon_f32)))); +#else + simde__m128d r; + SIMDE__VECTORIZE + for (size_t i = 0; i < (sizeof(r.f64) / sizeof(r.f64[0])); i++) { + r.u64[i] = (a.f64[i] == b.f64[i]) ? ~UINT64_C(0) : UINT64_C(0); + } + return r; +#endif +} + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128d simde_mm_cmpeq_sd(simde__m128d a, simde__m128d b) +{ +#if defined(SIMDE_SSE2_NATIVE) + return SIMDE__M128D_C(_mm_cmpeq_sd(a.n, b.n)); +#else + simde__m128d r; + r.u64[0] = (a.f64[0] == b.f64[0]) ? ~UINT64_C(0) : 0; + r.u64[1] = a.u64[1]; + return r; +#endif +} + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128d simde_mm_cmpneq_pd(simde__m128d a, simde__m128d b) +{ +#if defined(SIMDE_SSE2_NATIVE) + return SIMDE__M128D_C(_mm_cmpneq_pd(a.n, b.n)); +#elif defined(SIMDE_SSE2_NEON) + return SIMDE__M128D_NEON_C(f32, + vreinterpretq_f32_u16(vmvnq_u16( + vceqq_s16(b.neon_i16, a.neon_i16)))); +#else + simde__m128d r; + SIMDE__VECTORIZE + for (size_t i = 0; i < (sizeof(r.f64) / sizeof(r.f64[0])); i++) { + r.u64[i] = (a.f64[i] != b.f64[i]) ? ~UINT64_C(0) : UINT64_C(0); + } + return r; +#endif +} + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128d simde_mm_cmpneq_sd(simde__m128d a, simde__m128d b) +{ +#if defined(SIMDE_SSE2_NATIVE) + return SIMDE__M128D_C(_mm_cmpneq_sd(a.n, b.n)); +#else + simde__m128d r; + r.u64[0] = (a.f64[0] != b.f64[0]) ? ~UINT64_C(0) : UINT64_C(0); + r.u64[1] = a.u64[1]; + return r; +#endif +} + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128i simde_mm_cmplt_epi8(simde__m128i a, simde__m128i b) +{ +#if defined(SIMDE_SSE2_NATIVE) + return SIMDE__M128I_C(_mm_cmplt_epi8(a.n, b.n)); +#elif defined(SIMDE_SSE2_NEON) + return SIMDE__M128I_NEON_C( + i8, vreinterpretq_s8_u8(vcltq_s8(a.neon_i8, b.neon_i8))); +#else + simde__m128i r; + SIMDE__VECTORIZE + for (size_t i = 0; i < (sizeof(r.i8) / sizeof(r.i8[0])); i++) { + r.i8[i] = (a.i8[i] < b.i8[i]) ? 
0xff : 0x00; + } + return r; +#endif +} + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128i simde_mm_cmplt_epi16(simde__m128i a, simde__m128i b) +{ +#if defined(SIMDE_SSE2_NATIVE) + return SIMDE__M128I_C(_mm_cmplt_epi16(a.n, b.n)); +#elif defined(SIMDE_SSE2_NEON) + return SIMDE__M128I_NEON_C( + i16, vreinterpretq_s16_u16(vcltq_s16(a.neon_i16, b.neon_i16))); +#else + simde__m128i r; + SIMDE__VECTORIZE + for (size_t i = 0; i < (sizeof(r.i16) / sizeof(r.i16[0])); i++) { + r.i16[i] = (a.i16[i] < b.i16[i]) ? 0xffff : 0x0000; + } + return r; +#endif +} + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128i simde_mm_cmplt_epi32(simde__m128i a, simde__m128i b) +{ +#if defined(SIMDE_SSE2_NATIVE) + return SIMDE__M128I_C(_mm_cmplt_epi32(a.n, b.n)); +#elif defined(SIMDE_SSE2_NEON) + return SIMDE__M128I_NEON_C( + i32, vreinterpretq_s32_u32(vcltq_s32(a.neon_i32, b.neon_i32))); +#else + simde__m128i r; + SIMDE__VECTORIZE + for (size_t i = 0; i < (sizeof(r.i32) / sizeof(r.i32[0])); i++) { + r.i32[i] = (a.i32[i] < b.i32[i]) ? 0xffffffff : 0x00000000; + } + return r; +#endif +} + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128d simde_mm_cmplt_pd(simde__m128d a, simde__m128d b) +{ +#if defined(SIMDE_SSE2_NATIVE) + return SIMDE__M128D_C(_mm_cmplt_pd(a.n, b.n)); +#else + simde__m128d r; + SIMDE__VECTORIZE + for (size_t i = 0; i < (sizeof(r.f64) / sizeof(r.f64[0])); i++) { + r.u64[i] = (a.f64[i] < b.f64[i]) ? ~UINT64_C(0) : UINT64_C(0); + } + return r; +#endif +} + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128d simde_mm_cmplt_sd(simde__m128d a, simde__m128d b) +{ +#if defined(SIMDE_SSE2_NATIVE) + return SIMDE__M128D_C(_mm_cmplt_sd(a.n, b.n)); +#else + simde__m128d r; + r.u64[0] = (a.f64[0] < b.f64[0]) ? ~UINT64_C(0) : UINT64_C(0); + r.u64[1] = a.u64[1]; + return r; +#endif +} + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128d simde_mm_cmple_pd(simde__m128d a, simde__m128d b) +{ +#if defined(SIMDE_SSE2_NATIVE) + return SIMDE__M128D_C(_mm_cmple_pd(a.n, b.n)); +#else + simde__m128d r; + SIMDE__VECTORIZE + for (size_t i = 0; i < (sizeof(r.f64) / sizeof(r.f64[0])); i++) { + r.u64[i] = (a.f64[i] <= b.f64[i]) ? ~UINT64_C(0) : UINT64_C(0); + } + return r; +#endif +} + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128d simde_mm_cmple_sd(simde__m128d a, simde__m128d b) +{ +#if defined(SIMDE_SSE2_NATIVE) + return SIMDE__M128D_C(_mm_cmple_sd(a.n, b.n)); +#else + simde__m128d r; + r.u64[0] = (a.f64[0] <= b.f64[0]) ? ~UINT64_C(0) : UINT64_C(0); + r.u64[1] = a.u64[1]; + return r; +#endif +} + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128i simde_mm_cmpgt_epi8(simde__m128i a, simde__m128i b) +{ +#if defined(SIMDE_SSE2_NATIVE) + return SIMDE__M128I_C(_mm_cmpgt_epi8(a.n, b.n)); +#elif defined(SIMDE_SSE2_NEON) + return SIMDE__M128I_NEON_C( + i8, vreinterpretq_s8_u8(vcgtq_s8(a.neon_i8, b.neon_i8))); +#else + simde__m128i r; + SIMDE__VECTORIZE + for (size_t i = 0; i < (sizeof(r.i8) / sizeof(r.i8[0])); i++) { + r.i8[i] = (a.i8[i] > b.i8[i]) ? 0xff : 0x00; + } + return r; +#endif +} + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128i simde_mm_cmpgt_epi16(simde__m128i a, simde__m128i b) +{ +#if defined(SIMDE_SSE2_NATIVE) + return SIMDE__M128I_C(_mm_cmpgt_epi16(a.n, b.n)); +#elif defined(SIMDE_SSE2_NEON) + return SIMDE__M128I_NEON_C( + i16, vreinterpretq_s16_u16(vcgtq_s16(a.neon_i16, b.neon_i16))); +#else + simde__m128i r; + SIMDE__VECTORIZE + for (size_t i = 0; i < (sizeof(r.i16) / sizeof(r.i16[0])); i++) { + r.i16[i] = (a.i16[i] > b.i16[i]) ? 
0xffff : 0x0000; + } + return r; +#endif +} + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128i simde_mm_cmpgt_epi32(simde__m128i a, simde__m128i b) +{ +#if defined(SIMDE_SSE2_NATIVE) + return SIMDE__M128I_C(_mm_cmpgt_epi32(a.n, b.n)); +#elif defined(SIMDE_SSE2_NEON) + return SIMDE__M128I_NEON_C( + i32, vreinterpretq_s32_u32(vcgtq_s32(a.neon_i32, b.neon_i32))); +#else + simde__m128i r; + SIMDE__VECTORIZE + for (size_t i = 0; i < (sizeof(r.i32) / sizeof(r.i32[0])); i++) { + r.i32[i] = (a.i32[i] > b.i32[i]) ? 0xffffffff : 0x00000000; + } + return r; +#endif +} + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128d simde_mm_cmpgt_pd(simde__m128d a, simde__m128d b) +{ +#if defined(SIMDE_SSE2_NATIVE) + return SIMDE__M128D_C(_mm_cmpgt_pd(a.n, b.n)); +#else + simde__m128d r; + SIMDE__VECTORIZE + for (size_t i = 0; i < (sizeof(r.f64) / sizeof(r.f64[0])); i++) { + r.u64[i] = (a.f64[i] > b.f64[i]) ? ~UINT64_C(0) : UINT64_C(0); + } + return r; +#endif +} + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128d simde_mm_cmpgt_sd(simde__m128d a, simde__m128d b) +{ +#if defined(SIMDE_SSE2_NATIVE) && !defined(__PGI) + return SIMDE__M128D_C(_mm_cmpgt_sd(a.n, b.n)); +#else + simde__m128d r; + r.u64[0] = (a.f64[0] > b.f64[0]) ? ~UINT64_C(0) : UINT64_C(0); + r.u64[1] = a.u64[1]; + return r; +#endif +} + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128d simde_mm_cmpge_pd(simde__m128d a, simde__m128d b) +{ +#if defined(SIMDE_SSE2_NATIVE) + return SIMDE__M128D_C(_mm_cmpge_pd(a.n, b.n)); +#else + simde__m128d r; + SIMDE__VECTORIZE + for (size_t i = 0; i < (sizeof(r.f64) / sizeof(r.f64[0])); i++) { + r.u64[i] = (a.f64[i] >= b.f64[i]) ? ~UINT64_C(0) : UINT64_C(0); + } + return r; +#endif +} + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128d simde_mm_cmpge_sd(simde__m128d a, simde__m128d b) +{ +#if defined(SIMDE_SSE2_NATIVE) && !defined(__PGI) + return SIMDE__M128D_C(_mm_cmpge_sd(a.n, b.n)); +#else + simde__m128d r; + r.u64[0] = (a.f64[0] >= b.f64[0]) ? 
~UINT64_C(0) : UINT64_C(0); + r.u64[1] = a.u64[1]; + return r; +#endif +} + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128d simde_mm_cmpnge_pd(simde__m128d a, simde__m128d b) +{ +#if defined(SIMDE_SSE2_NATIVE) + return SIMDE__M128D_C(_mm_cmpnge_pd(a.n, b.n)); +#else + return simde_mm_cmplt_pd(a, b); +#endif +} + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128d simde_mm_cmpnge_sd(simde__m128d a, simde__m128d b) +{ +#if defined(SIMDE_SSE2_NATIVE) && !defined(__PGI) + return SIMDE__M128D_C(_mm_cmpnge_sd(a.n, b.n)); +#else + return simde_mm_cmplt_sd(a, b); +#endif +} + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128d simde_mm_cmpnlt_pd(simde__m128d a, simde__m128d b) +{ +#if defined(SIMDE_SSE2_NATIVE) + return SIMDE__M128D_C(_mm_cmpnlt_pd(a.n, b.n)); +#else + return simde_mm_cmpge_pd(a, b); +#endif +} + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128d simde_mm_cmpnlt_sd(simde__m128d a, simde__m128d b) +{ +#if defined(SIMDE_SSE2_NATIVE) + return SIMDE__M128D_C(_mm_cmpnlt_sd(a.n, b.n)); +#else + return simde_mm_cmpge_sd(a, b); +#endif +} + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128d simde_mm_cmpnle_pd(simde__m128d a, simde__m128d b) +{ +#if defined(SIMDE_SSE2_NATIVE) + return SIMDE__M128D_C(_mm_cmpnle_pd(a.n, b.n)); +#else + return simde_mm_cmpgt_pd(a, b); +#endif +} + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128d simde_mm_cmpnle_sd(simde__m128d a, simde__m128d b) +{ +#if defined(SIMDE_SSE2_NATIVE) + return SIMDE__M128D_C(_mm_cmpnle_sd(a.n, b.n)); +#else + return simde_mm_cmpgt_sd(a, b); +#endif +} + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128d simde_mm_cmpord_pd(simde__m128d a, simde__m128d b) +{ +#if defined(SIMDE_SSE2_NATIVE) + return SIMDE__M128D_C(_mm_cmpord_pd(a.n, b.n)); +#else + simde__m128d r; + SIMDE__VECTORIZE + for (size_t i = 0; i < (sizeof(r.f64) / sizeof(r.f64[0])); i++) { + r.u64[i] = (!isnan(a.f64[i]) && !isnan(b.f64[i])) ? ~UINT64_C(0) + : UINT64_C(0); + } + return r; +#endif +} + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128d simde_mm_cmpord_sd(simde__m128d a, simde__m128d b) +{ +#if defined(SIMDE_SSE2_NATIVE) + return SIMDE__M128D_C(_mm_cmpord_sd(a.n, b.n)); +#else + simde__m128d r; + r.u64[0] = (!isnan(a.f64[0]) && !isnan(b.f64[0])) ? ~UINT64_C(0) + : UINT64_C(0); + r.u64[1] = a.u64[1]; + return r; +#endif +} + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128d simde_mm_cmpunord_pd(simde__m128d a, simde__m128d b) +{ +#if defined(SIMDE_SSE2_NATIVE) + return SIMDE__M128D_C(_mm_cmpunord_pd(a.n, b.n)); +#else + simde__m128d r; + SIMDE__VECTORIZE + for (size_t i = 0; i < (sizeof(r.f64) / sizeof(r.f64[0])); i++) { + r.u64[i] = (isnan(a.f64[i]) || isnan(b.f64[i])) ? ~UINT64_C(0) + : UINT64_C(0); + } + return r; +#endif +} + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128d simde_mm_cmpunord_sd(simde__m128d a, simde__m128d b) +{ +#if defined(SIMDE_SSE2_NATIVE) + return SIMDE__M128D_C(_mm_cmpunord_sd(a.n, b.n)); +#else + simde__m128d r; + r.u64[0] = (isnan(a.f64[0]) || isnan(b.f64[0])) ? 
~UINT64_C(0) + : UINT64_C(0); + r.u64[1] = a.u64[1]; + return r; +#endif +} + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128d simde_mm_cvtepi32_pd(simde__m128i a) +{ +#if defined(SIMDE_SSE2_NATIVE) + return SIMDE__M128D_C(_mm_cvtepi32_pd(a.n)); +#else + simde__m128d r; + SIMDE__VECTORIZE + for (size_t i = 0; i < (sizeof(r.f64) / sizeof(r.f64[0])); i++) { + r.f64[i] = (simde_float64)a.i32[i]; + } + return r; +#endif +} + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128 simde_mm_cvtepi32_ps(simde__m128i a) +{ +#if defined(SIMDE_SSE2_NATIVE) + return SIMDE__M128_C(_mm_cvtepi32_ps(a.n)); +#elif defined(SIMDE_SSE2_NEON) + return SIMDE__M128_NEON_C(f32, vcvtq_f32_s32(a.neon_i32)); +#else + simde__m128 r; + SIMDE__VECTORIZE + for (size_t i = 0; i < (sizeof(r.f32) / sizeof(r.f32[0])); i++) { + r.f32[i] = (simde_float32)a.i32[i]; + } + return r; +#endif +} + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128i simde_mm_cvtpd_epi32(simde__m128d a) +{ +#if defined(SIMDE_SSE2_NATIVE) + return SIMDE__M128I_C(_mm_cvtpd_epi32(a.n)); +#else + simde__m128i r; + SIMDE__VECTORIZE + for (size_t i = 0; i < (sizeof(r.f64) / sizeof(r.f64[0])); i++) { + r.i32[i] = (int32_t)a.f64[i]; + } + return r; +#endif +} + +SIMDE__FUNCTION_ATTRIBUTES +simde__m64 simde_mm_cvtpd_pi32(simde__m128d a) +{ +#if defined(SIMDE_SSE2_NATIVE) + return SIMDE__M64_C(_mm_cvtpd_pi32(a.n)); +#else + simde__m64 r; + SIMDE__VECTORIZE + for (size_t i = 0; i < (sizeof(r.i32) / sizeof(r.i32[0])); i++) { + r.i32[i] = (int32_t)a.f64[i]; + } + return r; +#endif +} + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128 simde_mm_cvtpd_ps(simde__m128d a) +{ +#if defined(SIMDE_SSE2_NATIVE) + return SIMDE__M128_C(_mm_cvtpd_ps(a.n)); +#else + simde__m128 r; + SIMDE__VECTORIZE + for (size_t i = 0; i < (sizeof(a.f64) / sizeof(a.f64[0])); i++) { + r.f32[i] = (simde_float32)a.f64[i]; + } + return r; +#endif +} + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128d simde_mm_cvtpi32_pd(simde__m64 a) +{ +#if defined(SIMDE_SSE2_NATIVE) + return SIMDE__M128D_C(_mm_cvtpi32_pd(a.n)); +#else + simde__m128d r; + SIMDE__VECTORIZE + for (size_t i = 0; i < (sizeof(r.f64) / sizeof(r.f64[0])); i++) { + r.f64[i] = (simde_float64)a.i32[i]; + } + return r; +#endif +} + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128i simde_mm_cvtps_epi32(simde__m128 a) +{ +#if defined(SIMDE_SSE2_NATIVE) + return SIMDE__M128I_C(_mm_cvtps_epi32(a.n)); +#elif defined(SIMDE_SSE2_NEON) +/* The default rounding mode on SSE is 'round to even', which ArmV7 + does not support! It is supported on ARMv8 however. 
*/ +#if defined(SIMDE_ARCH_AARCH64) + return SIMDE__M128I_NEON_C(i32, vcvtnq_s32_f32(a.neon_f32)); +#else + uint32x4_t signmask = vdupq_n_u32(0x80000000); + float32x4_t half = vbslq_f32(signmask, a.neon_f32, + vdupq_n_f32(0.5f)); /* +/- 0.5 */ + int32x4_t r_normal = vcvtq_s32_f32( + vaddq_f32(a.neon_f32, half)); /* round to integer: [a + 0.5]*/ + int32x4_t r_trunc = + vcvtq_s32_f32(a.neon_f32); /* truncate to integer: [a] */ + int32x4_t plusone = vshrq_n_s32(vnegq_s32(r_trunc), 31); /* 1 or 0 */ + int32x4_t r_even = vbicq_s32(vaddq_s32(r_trunc, plusone), + vdupq_n_s32(1)); /* ([a] + {0,1}) & ~1 */ + float32x4_t delta = vsubq_f32( + a.neon_f32, + vcvtq_f32_s32(r_trunc)); /* compute delta: delta = (a - [a]) */ + uint32x4_t is_delta_half = + vceqq_f32(delta, half); /* delta == +/- 0.5 */ + return SIMDE__M128I_NEON_C(i32, + vbslq_s32(is_delta_half, r_even, r_normal)); +#endif +#else + simde__m128i r; + SIMDE__VECTORIZE + for (size_t i = 0; i < (sizeof(r.i32) / sizeof(r.i32[0])); i++) { + r.i32[i] = (int32_t)a.f32[i]; + } + return r; +#endif +} + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128d simde_mm_cvtps_pd(simde__m128 a) +{ +#if defined(SIMDE_SSE2_NATIVE) + return SIMDE__M128D_C(_mm_cvtps_pd(a.n)); +#else + simde__m128d r; + SIMDE__VECTORIZE + for (size_t i = 0; i < (sizeof(r.f64) / sizeof(r.f64[0])); i++) { + r.f64[i] = a.f32[i]; + } + return r; +#endif +} + +SIMDE__FUNCTION_ATTRIBUTES +double simde_mm_cvtsd_f64(simde__m128d a) +{ +#if defined(SIMDE_SSE2_NATIVE) && !defined(__PGI) + return _mm_cvtsd_f64(a.n); +#else + return a.f64[0]; +#endif +} + +SIMDE__FUNCTION_ATTRIBUTES +int32_t simde_mm_cvtsd_si32(simde__m128d a) +{ +#if defined(SIMDE_SSE2_NATIVE) + return _mm_cvtsd_si32(a.n); +#else + return (int32_t)a.f64[0]; +#endif +} + +SIMDE__FUNCTION_ATTRIBUTES +int32_t simde_mm_cvtsd_si64(simde__m128d a) +{ +#if defined(SIMDE_SSE2_NATIVE) && defined(SIMDE_ARCH_AMD64) +#if defined(__PGI) + return _mm_cvtsd_si64x(a.n); +#else + return _mm_cvtsd_si64(a.n); +#endif +#else + return (int32_t)a.f64[0]; +#endif +} +#define simde_mm_cvtsd_si64x(a) simde_mm_cvtsd_si64(a) + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128 simde_mm_cvtsd_ss(simde__m128 a, simde__m128d b) +{ +#if defined(SIMDE_SSE2_NATIVE) + return SIMDE__M128_C(_mm_cvtsd_ss(a.n, b.n)); +#else + simde__m128 r; + + r.f32[0] = (simde_float32)b.f64[0]; + + SIMDE__VECTORIZE + for (size_t i = 1; i < (sizeof(r) / sizeof(r.i32[0])); i++) { + r.i32[i] = a.i32[i]; + } + + return r; +#endif +} + +SIMDE__FUNCTION_ATTRIBUTES +int32_t simde_mm_cvtsi128_si32(simde__m128i a) +{ +#if defined(SIMDE_SSE2_NATIVE) + return _mm_cvtsi128_si32(a.n); +#elif defined(SIMDE_SSE2_NEON) + return vgetq_lane_s32(a.neon_i32, 0); +#else + return a.i32[0]; +#endif +} + +SIMDE__FUNCTION_ATTRIBUTES +int64_t simde_mm_cvtsi128_si64(simde__m128i a) +{ +#if defined(SIMDE_SSE2_NATIVE) && defined(SIMDE_ARCH_AMD64) +#if defined(__PGI) + return _mm_cvtsi128_si64x(a.n); +#else + return _mm_cvtsi128_si64(a.n); +#endif +#else + return a.i64[0]; +#endif +} +#define simde_mm_cvtsi128_si64x(a) simde_mm_cvtsi128_si64(a) + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128d simde_mm_cvtsi32_sd(simde__m128d a, int32_t b) +{ +#if defined(SIMDE_SSE2_NATIVE) + return SIMDE__M128D_C(_mm_cvtsi32_sd(a.n, b)); +#else + simde__m128d r; + + r.f64[0] = (simde_float64)b; + r.i64[1] = a.i64[1]; + + return r; +#endif +} + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128i simde_mm_cvtsi32_si128(int32_t a) +{ + simde__m128i r; + +#if defined(SIMDE_SSE2_NATIVE) + r.n = _mm_cvtsi32_si128(a); +#elif defined(SIMDE_SSE2_NEON) + r.neon_i32 = 
vsetq_lane_s32(a, vdupq_n_s32(0), 0); +#else + r.i32[0] = a; + r.i32[1] = 0; + r.i32[2] = 0; + r.i32[3] = 0; +#endif + + return r; +} + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128d simde_mm_cvtsi64_sd(simde__m128d a, int32_t b) +{ + simde__m128d r; + +#if defined(SIMDE_SSE2_NATIVE) && defined(SIMDE_ARCH_AMD64) +#if !defined(__PGI) + r.n = _mm_cvtsi64_sd(a.n, b); +#else + r.n = _mm_cvtsi64x_sd(a.n, b); +#endif +#else + r.f64[0] = (simde_float64)b; + r.f64[1] = a.f64[1]; +#endif + + return r; +} +#define simde_mm_cvtsi64x_sd(a, b) simde_mm_cvtsi64(a, b) + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128i simde_mm_cvtsi64_si128(int64_t a) +{ + simde__m128i r; + +#if defined(SIMDE_SSE2_NATIVE) && defined(SIMDE_ARCH_AMD64) +#if !defined(__PGI) + r.n = _mm_cvtsi64_si128(a); +#else + r.n = _mm_cvtsi64x_si128(a); +#endif +#else + r.i64[0] = a; + r.i64[1] = 0; +#endif + + return r; +} +#define simde_mm_cvtsi64x_si128(a) simde_mm_cvtsi64_si128(a) + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128d simde_mm_cvtss_sd(simde__m128d a, simde__m128 b) +{ + simde__m128d r; + +#if defined(SIMDE_SSE2_NATIVE) + r.n = _mm_cvtss_sd(a.n, b.n); +#else + r.f64[0] = b.f32[0]; + r.i64[1] = a.i64[1]; +#endif + + return r; +} + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128i simde_mm_cvttpd_epi32(simde__m128d a) +{ + simde__m128i r; + +#if defined(SIMDE_SSE2_NATIVE) + r.n = _mm_cvttpd_epi32(a.n); +#else + for (size_t i = 0; i < (sizeof(a.f64) / sizeof(a.f64[0])); i++) { + r.i32[i] = (int32_t)trunc(a.f64[i]); + } +#endif + + return r; +} + +SIMDE__FUNCTION_ATTRIBUTES +simde__m64 simde_mm_cvttpd_pi32(simde__m128d a) +{ + simde__m64 r; + +#if defined(SIMDE_SSE2_NATIVE) + r.n = _mm_cvttpd_pi32(a.n); +#else + for (size_t i = 0; i < (sizeof(r.i32) / sizeof(r.i32[0])); i++) { + r.i32[i] = (int32_t)trunc(a.f64[i]); + } +#endif + + return r; +} + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128i simde_mm_cvttps_epi32(simde__m128 a) +{ + simde__m128i r; + +#if defined(SIMDE_SSE2_NATIVE) + r.n = _mm_cvttps_epi32(a.n); +#elif defined(SIMDE_SSE2_NEON) + r.neon_i32 = vcvtq_s32_f32(a.neon_f32); +#else + for (size_t i = 0; i < (sizeof(r.i32) / sizeof(r.i32[0])); i++) { + r.i32[i] = (int32_t)truncf(a.f32[i]); + } +#endif + + return r; +} + +SIMDE__FUNCTION_ATTRIBUTES +int32_t simde_mm_cvttsd_si32(simde__m128d a) +{ +#if defined(SIMDE_SSE2_NATIVE) + return _mm_cvttsd_si32(a.n); +#else + return (int32_t)trunc(a.f64[0]); +#endif +} + +SIMDE__FUNCTION_ATTRIBUTES +int64_t simde_mm_cvttsd_si64(simde__m128d a) +{ +#if defined(SIMDE_SSE2_NATIVE) && defined(SIMDE_ARCH_AMD64) +#if !defined(__PGI) + return _mm_cvttsd_si64(a.n); +#else + return _mm_cvttsd_si64x(a.n); +#endif +#else + return (int64_t)trunc(a.f64[0]); +#endif +} +#define simde_mm_cvttsd_si64x(a) simde_mm_cvttsd_si64(a) + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128d simde_mm_div_pd(simde__m128d a, simde__m128d b) +{ + simde__m128d r; + +#if defined(SIMDE_SSE2_NATIVE) + r.n = _mm_div_pd(a.n, b.n); +#else + SIMDE__VECTORIZE + for (size_t i = 0; i < (sizeof(r.f64) / sizeof(r.f64[0])); i++) { + r.f64[i] = a.f64[i] / b.f64[i]; + } +#endif + + return r; +} + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128d simde_mm_div_sd(simde__m128d a, simde__m128d b) +{ + simde__m128d r; + +#if defined(SIMDE_SSE2_NATIVE) + r.n = _mm_div_sd(a.n, b.n); +#else + r.f64[0] = a.f64[0] / b.f64[0]; + r.f64[1] = a.f64[1]; +#endif + + return r; +} + +SIMDE__FUNCTION_ATTRIBUTES +int32_t simde_mm_extract_epi16(simde__m128i a, const int imm8) +{ + return a.u16[imm8 & 7]; +} +#if defined(SIMDE_SSE2_NATIVE) && \ + (!defined(SIMDE__REALLY_GCC) || 
HEDLEY_GCC_VERSION_CHECK(4, 6, 0)) +#define simde_mm_extract_epi16(a, imm8) _mm_extract_epi16(a.n, imm8) +#elif defined(SIMDE_SSE2_NEON) +#define simde_mm_extract_epi16(a, imm8) \ + (vgetq_lane_s16((a).neon_i16, (imm8)) & ((int32_t)UINT32_C(0x0000ffff))) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128i simde_mm_insert_epi16(simde__m128i a, int32_t i, const int imm8) +{ + a.u16[imm8 & 7] = (int16_t)i; + return a; +} +#if defined(SIMDE_SSE2_NATIVE) && !defined(__PGI) +#define simde_mm_insert_epi16(a, i, imm8) \ + SIMDE__M128I_C(_mm_insert_epi16((a).n, (i), (imm8))) +#elif defined(SIMDE_SSE2_NEON) +#define simde_mm_insert_epi16(a, i, imm8) \ + SIMDE__M128I_NEON_C(i16, vsetq_lane_s16((i), a.neon_i16, (imm8))) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128d +simde_mm_load_pd(simde_float64 const mem_addr[HEDLEY_ARRAY_PARAM(2)]) +{ + simde__m128d r; + + simde_assert_aligned(16, mem_addr); + +#if defined(SIMDE_SSE2_NATIVE) + r.n = _mm_load_pd(mem_addr); +#elif defined(SIMDE_SSE2_NEON) + r.neon_u32 = vld1q_u32((uint32_t const *)mem_addr); +#else + SIMDE__ASSUME_ALIGNED(mem_addr, 16); + memcpy(&r, mem_addr, sizeof(r)); +#endif + + return r; +} + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128d simde_mm_load_pd1(simde_float64 const *mem_addr) +{ + simde__m128d r; + +#if defined(SIMDE_SSE2_NATIVE) + r.n = _mm_load_pd1(mem_addr); +#else + r.f64[0] = *mem_addr; + r.f64[1] = *mem_addr; +#endif + + return r; +} +#define simde_mm_load1_pd(mem_addr) simde_mm_load_pd1(mem_addr) + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128d simde_mm_load_sd(simde_float64 const *mem_addr) +{ + simde__m128d r; + +#if defined(SIMDE_SSE2_NATIVE) + r.n = _mm_load_sd(mem_addr); +#else + memcpy(&r, mem_addr, sizeof(simde_float64)); + r.u64[1] = 0; +#endif + + return r; +} + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128i simde_mm_load_si128(simde__m128i const *mem_addr) +{ + simde__m128i r; + + simde_assert_aligned(16, mem_addr); + +#if defined(SIMDE_SSE2_NATIVE) + r.n = _mm_load_si128(&(mem_addr->n)); +#elif defined(SIMDE_SSE2_NEON) + r.neon_i32 = vld1q_s32((int32_t const *)mem_addr); +#else + SIMDE__ASSUME_ALIGNED(mem_addr, 16); + memcpy(&r, mem_addr, sizeof(r)); +#endif + + return r; +} + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128d simde_mm_loadh_pd(simde__m128d a, simde_float64 const *mem_addr) +{ + simde__m128d r; + +#if defined(SIMDE_SSE2_NATIVE) + r.n = _mm_loadh_pd(a.n, mem_addr); +#else + simde_float64 t; + memcpy(&t, mem_addr, sizeof(t)); + r.f64[0] = a.f64[0]; + r.f64[1] = t; +#endif + + return r; +} + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128i simde_mm_loadl_epi64(simde__m128i const *mem_addr) +{ + simde__m128i r; + +#if defined(SIMDE_SSE2_NATIVE) + r.n = _mm_loadl_epi64(&mem_addr->n); +#elif defined(SIMDE_SSE2_NEON) + r.neon_i32 = vcombine_s32(vld1_s32((int32_t const *)mem_addr), + vcreate_s32(0)); +#else + r.u64[0] = mem_addr->u64[0]; + r.u64[1] = 0; +#endif + + return r; +} + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128d simde_mm_loadl_pd(simde__m128d a, simde_float64 const *mem_addr) +{ + simde__m128d r; + +#if defined(SIMDE_SSE2_NATIVE) + r.n = _mm_loadl_pd(a.n, mem_addr); +#else + memcpy(&r, mem_addr, sizeof(simde_float64)); + r.u64[1] = a.u64[1]; +#endif + + return r; +} + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128d +simde_mm_loadr_pd(simde_float64 const mem_addr[HEDLEY_ARRAY_PARAM(2)]) +{ + simde__m128d r; + + simde_assert_aligned(16, mem_addr); + +#if defined(SIMDE_SSE2_NATIVE) + r.n = _mm_loadr_pd(mem_addr); +#else + SIMDE__ASSUME_ALIGNED(mem_addr, 16); + r.f64[0] = mem_addr[1]; + r.f64[1] = mem_addr[0]; +#endif + + return r; 
+} + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128d +simde_mm_loadu_pd(simde_float64 const mem_addr[HEDLEY_ARRAY_PARAM(2)]) +{ + simde__m128d r; + +#if defined(SIMDE_SSE2_NATIVE) + r.n = _mm_loadu_pd(mem_addr); +#else + simde_float64 l, h; + memcpy(&l, &mem_addr[0], sizeof(l)); + memcpy(&h, &mem_addr[1], sizeof(h)); + r.f64[0] = l; + r.f64[1] = h; +#endif + + return r; +} + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128i simde_mm_loadu_si128(simde__m128i const *mem_addr) +{ + simde__m128i r; + +#if defined(SIMDE_SSE2_NATIVE) + r.n = _mm_loadu_si128(&((*mem_addr).n)); +#elif defined(SIMDE_SSE2_NEON) + r.neon_i32 = vld1q_s32((int32_t const *)mem_addr); +#else + memcpy(&r, mem_addr, sizeof(r)); +#endif + + return r; +} + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128i simde_mm_madd_epi16(simde__m128i a, simde__m128i b) +{ + simde__m128i r; + +#if defined(SIMDE_SSE2_NATIVE) + r.n = _mm_madd_epi16(a.n, b.n); +#elif defined(SIMDE_SSE2_NEON) + int32x4_t pl = + vmull_s16(vget_low_s16(a.neon_i16), vget_low_s16(b.neon_i16)); + int32x4_t ph = + vmull_s16(vget_high_s16(a.neon_i16), vget_high_s16(b.neon_i16)); + int32x2_t rl = vpadd_s32(vget_low_s32(pl), vget_high_s32(pl)); + int32x2_t rh = vpadd_s32(vget_low_s32(ph), vget_high_s32(ph)); + r.neon_i32 = vcombine_s32(rl, rh); +#else + SIMDE__VECTORIZE + for (size_t i = 0; i < (sizeof(r) / sizeof(r.i16[0])); i += 2) { + r.i32[i / 2] = + (a.i16[i] * b.i16[i]) + (a.i16[i + 1] * b.i16[i + 1]); + } +#endif + + return r; +} + +SIMDE__FUNCTION_ATTRIBUTES +void simde_mm_maskmoveu_si128(simde__m128i a, simde__m128i mask, + int8_t mem_addr[HEDLEY_ARRAY_PARAM(16)]) +{ +#if defined(SIMDE_SSE2_NATIVE) + _mm_maskmoveu_si128(a.n, mask.n, (char *)mem_addr); +#else + for (size_t i = 0; i < 16; i++) { + if (mask.u8[i] & 0x80) { + mem_addr[i] = a.i8[i]; + } + } +#endif +} + +SIMDE__FUNCTION_ATTRIBUTES +int32_t simde_mm_movemask_epi8(simde__m128i a) +{ +#if defined(SIMDE_SSE2_NATIVE) + return _mm_movemask_epi8(a.n); +#elif defined(SIMDE_SSE2_NEON) + uint8x16_t input = a.neon_u8; + SIMDE_ALIGN(16) + static const int8_t xr[8] = {-7, -6, -5, -4, -3, -2, -1, 0}; + uint8x8_t mask_and = vdup_n_u8(0x80); + int8x8_t mask_shift = vld1_s8(xr); + + uint8x8_t lo = vget_low_u8(input); + uint8x8_t hi = vget_high_u8(input); + + lo = vand_u8(lo, mask_and); + lo = vshl_u8(lo, mask_shift); + + hi = vand_u8(hi, mask_and); + hi = vshl_u8(hi, mask_shift); + + lo = vpadd_u8(lo, lo); + lo = vpadd_u8(lo, lo); + lo = vpadd_u8(lo, lo); + + hi = vpadd_u8(hi, hi); + hi = vpadd_u8(hi, hi); + hi = vpadd_u8(hi, hi); + + return ((hi[0] << 8) | (lo[0] & 0xFF)); +#else + int32_t r = 0; + SIMDE__VECTORIZE_REDUCTION(| : r) + for (size_t i = 0; i < 16; i++) { + r |= (a.u8[15 - i] >> 7) << (15 - i); + } + return r; +#endif +} + +SIMDE__FUNCTION_ATTRIBUTES +int32_t simde_mm_movemask_pd(simde__m128d a) +{ +#if defined(SIMDE_SSE2_NATIVE) + return _mm_movemask_pd(a.n); +#else + int32_t r = 0; + SIMDE__VECTORIZE + for (size_t i = 0; i < (sizeof(a.u64) / sizeof(a.u64[0])); i++) { + r |= (a.u64[i] >> 63) << i; + } + return r; +#endif +} + +SIMDE__FUNCTION_ATTRIBUTES +simde__m64 simde_mm_movepi64_pi64(simde__m128i a) +{ + simde__m64 r; + +#if defined(SIMDE_SSE2_NATIVE) + r.n = _mm_movepi64_pi64(a.n); +#else + r.i64[0] = a.i64[0]; +#endif + + return r; +} + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128i simde_mm_movpi64_epi64(simde__m64 a) +{ + simde__m128i r; + +#if defined(SIMDE_SSE2_NATIVE) + r.n = _mm_movpi64_epi64(a.n); +#else + r.i64[0] = a.i64[0]; + r.i64[1] = 0; +#endif + + return r; +} + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128i 
simde_mm_min_epi16(simde__m128i a, simde__m128i b) +{ + simde__m128i r; + +#if defined(SIMDE_SSE2_NATIVE) + r.n = _mm_min_epi16(a.n, b.n); +#elif defined(SIMDE_SSE2_NEON) + r.neon_i16 = vminq_s16(a.neon_i16, b.neon_i16); +#else + SIMDE__VECTORIZE + for (size_t i = 0; i < (sizeof(r.i16) / sizeof(r.i16[0])); i++) { + r.i16[i] = (a.i16[i] < b.i16[i]) ? a.i16[i] : b.i16[i]; + } +#endif + + return r; +} + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128i simde_mm_min_epu8(simde__m128i a, simde__m128i b) +{ + simde__m128i r; + +#if defined(SIMDE_SSE2_NATIVE) + r.n = _mm_min_epu8(a.n, b.n); +#elif defined(SIMDE_SSE2_NEON) + r.neon_u8 = vminq_u8(a.neon_u8, b.neon_u8); +#else + SIMDE__VECTORIZE + for (size_t i = 0; i < (sizeof(r.u8) / sizeof(r.u8[0])); i++) { + r.u8[i] = (a.u8[i] < b.u8[i]) ? a.u8[i] : b.u8[i]; + } +#endif + + return r; +} + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128d simde_mm_min_pd(simde__m128d a, simde__m128d b) +{ + simde__m128d r; + +#if defined(SIMDE_SSE2_NATIVE) + r.n = _mm_min_pd(a.n, b.n); +#else + SIMDE__VECTORIZE + for (size_t i = 0; i < (sizeof(r.f64) / sizeof(r.f64[0])); i++) { + r.f64[i] = (a.f64[i] < b.f64[i]) ? a.f64[i] : b.f64[i]; + } +#endif + + return r; +} + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128d simde_mm_min_sd(simde__m128d a, simde__m128d b) +{ + simde__m128d r; + +#if defined(SIMDE_SSE2_NATIVE) + r.n = _mm_min_sd(a.n, b.n); +#else + r.f64[0] = (a.f64[0] < b.f64[0]) ? a.f64[0] : b.f64[0]; + r.f64[1] = a.f64[1]; +#endif + + return r; +} + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128i simde_mm_max_epi16(simde__m128i a, simde__m128i b) +{ + simde__m128i r; + +#if defined(SIMDE_SSE2_NATIVE) + r.n = _mm_max_epi16(a.n, b.n); +#elif defined(SIMDE_SSE2_NEON) + r.neon_i16 = vmaxq_s16(a.neon_i16, b.neon_i16); +#else + SIMDE__VECTORIZE + for (size_t i = 0; i < (sizeof(r.i16) / sizeof(r.i16[0])); i++) { + r.i16[i] = (a.i16[i] > b.i16[i]) ? a.i16[i] : b.i16[i]; + } +#endif + + return r; +} + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128i simde_mm_max_epu8(simde__m128i a, simde__m128i b) +{ + simde__m128i r; + +#if defined(SIMDE_SSE2_NATIVE) + r.n = _mm_max_epu8(a.n, b.n); +#elif defined(SIMDE_SSE2_NEON) + r.neon_u8 = vmaxq_u8(a.neon_u8, b.neon_u8); +#else + SIMDE__VECTORIZE + for (size_t i = 0; i < (sizeof(r.u8) / sizeof(r.u8[0])); i++) { + r.u8[i] = (a.u8[i] > b.u8[i]) ? a.u8[i] : b.u8[i]; + } +#endif + + return r; +} + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128d simde_mm_max_pd(simde__m128d a, simde__m128d b) +{ + simde__m128d r; + +#if defined(SIMDE_SSE2_NATIVE) + r.n = _mm_max_pd(a.n, b.n); +#else + SIMDE__VECTORIZE + for (size_t i = 0; i < (sizeof(r.f64) / sizeof(r.f64[0])); i++) { + r.f64[i] = (a.f64[i] > b.f64[i]) ? a.f64[i] : b.f64[i]; + } +#endif + + return r; +} + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128d simde_mm_max_sd(simde__m128d a, simde__m128d b) +{ + simde__m128d r; + +#if defined(SIMDE_SSE2_NATIVE) + r.n = _mm_max_sd(a.n, b.n); +#else + r.f64[0] = (a.f64[0] > b.f64[0]) ? 
a.f64[0] : b.f64[0]; + r.f64[1] = a.f64[1]; +#endif + + return r; +} + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128i simde_mm_move_epi64(simde__m128i a) +{ + simde__m128i r; + +#if defined(SIMDE_SSE2_NATIVE) + r.n = _mm_move_epi64(a.n); +#elif defined(SIMDE_SSE2_NEON) + r.neon_i64 = vsetq_lane_s64(0, a.neon_i64, 1); +#else + r.i64[0] = a.i64[0]; + r.i64[1] = 0; +#endif + + return r; +} + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128d simde_mm_move_sd(simde__m128d a, simde__m128d b) +{ + simde__m128d r; + +#if defined(SIMDE_SSE2_NATIVE) + r.n = _mm_move_sd(a.n, b.n); +#else + r.f64[0] = b.f64[0]; + r.f64[1] = a.f64[1]; +#endif + + return r; +} + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128i simde_mm_mul_epu32(simde__m128i a, simde__m128i b) +{ + simde__m128i r; + +#if defined(SIMDE_SSE2_NATIVE) + r.n = _mm_mul_epu32(a.n, b.n); +#else + SIMDE__VECTORIZE + for (size_t i = 0; i < (sizeof(r.u64) / sizeof(r.u64[0])); i++) { + r.u64[i] = ((uint64_t)a.u32[i * 2]) * ((uint64_t)b.u32[i * 2]); + } +#endif + + return r; +} + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128i simde_x_mm_mul_epi64(simde__m128i a, simde__m128i b) +{ + simde__m128i r; + + SIMDE__VECTORIZE + for (size_t i = 0; i < (sizeof(r.i64) / sizeof(r.i64[0])); i++) { + r.i64[i] = a.i64[i] * b.i64[i]; + } + + return r; +} + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128i simde_x_mm_mod_epi64(simde__m128i a, simde__m128i b) +{ + simde__m128i r; + + SIMDE__VECTORIZE + for (size_t i = 0; i < (sizeof(r.i64) / sizeof(r.i64[0])); i++) { + r.i64[i] = a.i64[i] % b.i64[i]; + } + + return r; +} + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128d simde_mm_mul_pd(simde__m128d a, simde__m128d b) +{ + simde__m128d r; + +#if defined(SIMDE_SSE2_NATIVE) + r.n = _mm_mul_pd(a.n, b.n); +#else + SIMDE__VECTORIZE + for (size_t i = 0; i < (sizeof(r.f64) / sizeof(r.f64[0])); i++) { + r.f64[i] = a.f64[i] * b.f64[i]; + } +#endif + + return r; +} + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128d simde_mm_mul_sd(simde__m128d a, simde__m128d b) +{ + simde__m128d r; + +#if defined(SIMDE_SSE2_NATIVE) + r.n = _mm_mul_sd(a.n, b.n); +#else + r.f64[0] = a.f64[0] * b.f64[0]; + r.f64[1] = a.f64[1]; +#endif + + return r; +} + +SIMDE__FUNCTION_ATTRIBUTES +simde__m64 simde_mm_mul_su32(simde__m64 a, simde__m64 b) +{ + simde__m64 r; + +#if defined(SIMDE_SSE2_NATIVE) && !defined(__PGI) + r.n = _mm_mul_su32(a.n, b.n); +#else + r.u64[0] = ((uint64_t)a.u32[0]) * ((uint64_t)b.u32[0]); +#endif + + return r; +} + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128i simde_mm_mulhi_epi16(simde__m128i a, simde__m128i b) +{ + simde__m128i r; + +#if defined(SIMDE_SSE2_NATIVE) + r.n = _mm_mulhi_epi16(a.n, b.n); +#elif defined(SIMDE_SSE2_NEON) + int16x4_t a3210 = vget_low_s16(a.neon_i16); + int16x4_t b3210 = vget_low_s16(b.neon_i16); + int32x4_t ab3210 = vmull_s16(a3210, b3210); /* 3333222211110000 */ + int16x4_t a7654 = vget_high_s16(a.neon_i16); + int16x4_t b7654 = vget_high_s16(b.neon_i16); + int32x4_t ab7654 = vmull_s16(a7654, b7654); /* 7777666655554444 */ + uint16x8x2_t rv = vuzpq_u16(vreinterpretq_u16_s32(ab3210), + vreinterpretq_u16_s32(ab7654)); + r.neon_u16 = rv.val[1]; +#else + SIMDE__VECTORIZE + for (size_t i = 0; i < (sizeof(r.i16) / sizeof(r.i16[0])); i++) { + r.u16[i] = (uint16_t)(((uint32_t)(((int32_t)a.i16[i]) * + ((int32_t)b.i16[i]))) >> + 16); + } +#endif + + return r; +} + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128i simde_mm_mulhi_epu16(simde__m128i a, simde__m128i b) +{ + simde__m128i r; + +#if defined(SIMDE_SSE2_NATIVE) && !defined(__PGI) + r.n = _mm_mulhi_epu16(a.n, b.n); +#else + SIMDE__VECTORIZE + for (size_t i = 0; i 
< (sizeof(r.u16) / sizeof(r.u16[0])); i++) { + r.u16[i] = (uint16_t)( + (((uint32_t)a.u16[i]) * ((uint32_t)b.u16[i])) >> 16); + } +#endif + + return r; +} + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128i simde_mm_mullo_epi16(simde__m128i a, simde__m128i b) +{ + simde__m128i r; + +#if defined(SIMDE_SSE2_NATIVE) + r.n = _mm_mullo_epi16(a.n, b.n); +#elif defined(SIMDE_SSE2_NEON) + r.neon_i16 = vmulq_s16(a.neon_i16, b.neon_i16); +#else + SIMDE__VECTORIZE + for (size_t i = 0; i < (sizeof(r.i16) / sizeof(r.i16[0])); i++) { + r.u16[i] = (uint16_t)(((uint32_t)(((int32_t)a.i16[i]) * + ((int32_t)b.i16[i]))) & + 0xffff); + } +#endif + + return r; +} + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128d simde_mm_or_pd(simde__m128d a, simde__m128d b) +{ + simde__m128d r; + +#if defined(SIMDE_SSE2_NATIVE) + r.n = _mm_or_pd(a.n, b.n); +#else + SIMDE__VECTORIZE + for (size_t i = 0; i < (sizeof(r.i64) / sizeof(r.i64[0])); i++) { + r.i64[i] = a.i64[i] | b.i64[i]; + } +#endif + + return r; +} + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128i simde_mm_or_si128(simde__m128i a, simde__m128i b) +{ + simde__m128i r; + +#if defined(SIMDE_SSE2_NATIVE) + r.n = _mm_or_si128(a.n, b.n); +#elif defined(SIMDE_SSE2_NEON) + r.neon_i32 = vorrq_s32(a.neon_i32, b.neon_i32); +#else + SIMDE__VECTORIZE + for (size_t i = 0; i < (sizeof(r.i64) / sizeof(r.i64[0])); i++) { + r.i64[i] = a.i64[i] | b.i64[i]; + } +#endif + + return r; +} + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128i simde_mm_packs_epi16(simde__m128i a, simde__m128i b) +{ + simde__m128i r; + +#if defined(SIMDE_SSE2_NATIVE) + r.n = _mm_packs_epi16(a.n, b.n); +#elif defined(SIMDE_SSE2_NEON) + r.neon_i8 = vcombine_s8(vqmovn_s16(a.neon_i16), vqmovn_s16(b.neon_i16)); +#else + SIMDE__VECTORIZE + for (size_t i = 0; i < (sizeof(r.i16) / sizeof(r.i16[0])); i++) { + r.i8[i] = (a.i16[i] > INT8_MAX) + ? INT8_MAX + : ((a.i16[i] < INT8_MIN) + ? INT8_MIN + : ((int8_t)a.i16[i])); + r.i8[i + 8] = (b.i16[i] > INT8_MAX) + ? INT8_MAX + : ((b.i16[i] < INT8_MIN) + ? INT8_MIN + : ((int8_t)b.i16[i])); + } +#endif + + return r; +} + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128i simde_mm_packs_epi32(simde__m128i a, simde__m128i b) +{ + simde__m128i r; + +#if defined(SIMDE_SSE2_NATIVE) + r.n = _mm_packs_epi32(a.n, b.n); +#elif defined(SIMDE_SSE2_NEON) + r.neon_i16 = + vcombine_s16(vqmovn_s32(a.neon_i32), vqmovn_s32(b.neon_i32)); +#else + SIMDE__VECTORIZE + for (size_t i = 0; i < (sizeof(r.i32) / sizeof(r.i32[0])); i++) { + r.i16[i] = (a.i32[i] > INT16_MAX) + ? INT16_MAX + : ((a.i32[i] < INT16_MIN) + ? INT16_MIN + : ((int16_t)a.i32[i])); + r.i16[i + 4] = (b.i32[i] > INT16_MAX) + ? INT16_MAX + : ((b.i32[i] < INT16_MIN) + ? INT16_MIN + : ((int16_t)b.i32[i])); + } +#endif + + return r; +} + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128i simde_mm_packus_epi16(simde__m128i a, simde__m128i b) +{ + simde__m128i r; + +#if defined(SIMDE_SSE2_NATIVE) + r.n = _mm_packus_epi16(a.n, b.n); +#elif defined(SIMDE_SSE2_NEON) + r.neon_u8 = + vcombine_u8(vqmovun_s16(a.neon_i16), vqmovun_s16(b.neon_i16)); +#else + SIMDE__VECTORIZE + for (size_t i = 0; i < (sizeof(r.i16) / sizeof(r.i16[0])); i++) { + r.u8[i] = (a.i16[i] > UINT8_MAX) + ? UINT8_MAX + : ((a.i16[i] < 0) ? 0 : ((int8_t)a.i16[i])); + r.u8[i + 8] = + (b.i16[i] > UINT8_MAX) + ? UINT8_MAX + : ((b.i16[i] < 0) ? 
0 : ((int8_t)b.i16[i])); + } +#endif + + return r; +} + +SIMDE__FUNCTION_ATTRIBUTES +void simde_mm_pause(void) +{ +#if defined(SIMDE_SSE2_NATIVE) + _mm_pause(); +#endif +} + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128i simde_mm_sad_epu8(simde__m128i a, simde__m128i b) +{ + simde__m128i r; + +#if defined(SIMDE_SSE2_NATIVE) + r.n = _mm_sad_epu8(a.n, b.n); +#else + for (size_t i = 0; i < (sizeof(r.i64) / sizeof(r.i64[0])); i++) { + uint16_t tmp = 0; + SIMDE__VECTORIZE_REDUCTION(+ : tmp) + for (size_t j = 0; j < ((sizeof(r.u8) / sizeof(r.u8[0])) / 2); + j++) { + const size_t e = j + (i * 8); + tmp += (a.u8[e] > b.u8[e]) ? (a.u8[e] - b.u8[e]) + : (b.u8[e] - a.u8[e]); + } + r.i64[i] = tmp; + } +#endif + + return r; +} + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128i simde_mm_set_epi8(int8_t e15, int8_t e14, int8_t e13, int8_t e12, + int8_t e11, int8_t e10, int8_t e9, int8_t e8, + int8_t e7, int8_t e6, int8_t e5, int8_t e4, + int8_t e3, int8_t e2, int8_t e1, int8_t e0) +{ + simde__m128i r; + +#if defined(SIMDE_SSE2_NATIVE) + r.n = _mm_set_epi8(e15, e14, e13, e12, e11, e10, e9, e8, e7, e6, e5, e4, + e3, e2, e1, e0); +#else + r.i8[0] = e0; + r.i8[1] = e1; + r.i8[2] = e2; + r.i8[3] = e3; + r.i8[4] = e4; + r.i8[5] = e5; + r.i8[6] = e6; + r.i8[7] = e7; + r.i8[8] = e8; + r.i8[9] = e9; + r.i8[10] = e10; + r.i8[11] = e11; + r.i8[12] = e12; + r.i8[13] = e13; + r.i8[14] = e14; + r.i8[15] = e15; +#endif + + return r; +} + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128i simde_mm_set_epi16(int16_t e7, int16_t e6, int16_t e5, int16_t e4, + int16_t e3, int16_t e2, int16_t e1, int16_t e0) +{ + simde__m128i r; + +#if defined(SIMDE_SSE2_NATIVE) + r.n = _mm_set_epi16(e7, e6, e5, e4, e3, e2, e1, e0); +#elif defined(SIMDE_SSE2_NEON) + SIMDE_ALIGN(16) int16_t data[8] = {e0, e1, e2, e3, e4, e5, e6, e7}; + r.neon_i16 = vld1q_s16(data); +#else + r.i16[0] = e0; + r.i16[1] = e1; + r.i16[2] = e2; + r.i16[3] = e3; + r.i16[4] = e4; + r.i16[5] = e5; + r.i16[6] = e6; + r.i16[7] = e7; +#endif + + return r; +} + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128i simde_mm_set_epi32(int32_t e3, int32_t e2, int32_t e1, int32_t e0) +{ + simde__m128i r; + +#if defined(SIMDE_SSE2_NATIVE) + r.n = _mm_set_epi32(e3, e2, e1, e0); +#elif defined(SIMDE_SSE2_NEON) + SIMDE_ALIGN(16) int32_t data[4] = {e0, e1, e2, e3}; + r.neon_i32 = vld1q_s32(data); +#else + r.i32[0] = e0; + r.i32[1] = e1; + r.i32[2] = e2; + r.i32[3] = e3; +#endif + + return r; +} + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128i simde_mm_set_epi64(simde__m64 e1, simde__m64 e0) +{ + simde__m128i r; + +#if defined(SIMDE_SSE2_NATIVE) + r.n = _mm_set_epi64(e1.n, e0.n); +#else + r.i64[0] = e0.i64[0]; + r.i64[1] = e1.i64[0]; +#endif + + return r; +} + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128i simde_mm_set_epi64x(int64_t e1, int64_t e0) +{ + simde__m128i r; + +#if defined(SIMDE_SSE2_NATIVE) + r.n = _mm_set_epi64x(e1, e0); +#elif defined(SIMDE_SSE2_NEON) + r = SIMDE__M128I_NEON_C(i64, + vcombine_s64(vdup_n_s64(e0), vdup_n_s64(e1))); +#else + r.i64[0] = e0; + r.i64[1] = e1; +#endif + + return r; +} + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128i simde_x_mm_set_epu8(uint8_t e15, uint8_t e14, uint8_t e13, + uint8_t e12, uint8_t e11, uint8_t e10, + uint8_t e9, uint8_t e8, uint8_t e7, uint8_t e6, + uint8_t e5, uint8_t e4, uint8_t e3, uint8_t e2, + uint8_t e1, uint8_t e0) +{ + simde__m128i r; + + r.u8[0] = e0; + r.u8[1] = e1; + r.u8[2] = e2; + r.u8[3] = e3; + r.u8[4] = e4; + r.u8[5] = e5; + r.u8[6] = e6; + r.u8[7] = e7; + r.u8[8] = e8; + r.u8[9] = e9; + r.u8[10] = e10; + r.u8[11] = e11; + r.u8[12] = e12; + r.u8[13] = e13; 
+ r.u8[14] = e14; + r.u8[15] = e15; + + return r; +} + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128i simde_x_mm_set_epu16(uint16_t e7, uint16_t e6, uint16_t e5, + uint16_t e4, uint16_t e3, uint16_t e2, + uint16_t e1, uint16_t e0) +{ + simde__m128i r; + + r.u16[0] = e0; + r.u16[1] = e1; + r.u16[2] = e2; + r.u16[3] = e3; + r.u16[4] = e4; + r.u16[5] = e5; + r.u16[6] = e6; + r.u16[7] = e7; + + return r; +} + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128i simde_x_mm_set_epu32(uint32_t e3, uint32_t e2, uint32_t e1, + uint32_t e0) +{ + simde__m128i r; + + r.u32[0] = e0; + r.u32[1] = e1; + r.u32[2] = e2; + r.u32[3] = e3; + + return r; +} + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128i simde_x_mm_set_epu64x(uint64_t e1, uint64_t e0) +{ + simde__m128i r; + + r.u64[0] = e0; + r.u64[1] = e1; + + return r; +} + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128d simde_mm_set_pd(simde_float64 e1, simde_float64 e0) +{ + simde__m128d r; + +#if defined(SIMDE_SSE2_NATIVE) + r.n = _mm_set_pd(e1, e0); +#else + r.f64[0] = e0; + r.f64[1] = e1; +#endif + + return r; +} + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128d simde_mm_set_pd1(simde_float64 a) +{ + simde__m128d r; + +#if defined(SIMDE_SSE2_NATIVE) + r.n = _mm_set1_pd(a); +#else + r.f64[0] = a; + r.f64[1] = a; +#endif + + return r; +} + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128d simde_mm_set_sd(simde_float64 a) +{ + simde__m128d r; + +#if defined(SIMDE_SSE2_NATIVE) + r.n = _mm_set_sd(a); +#else + r.f64[0] = a; + r.u64[1] = 0; +#endif + + return r; +} + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128i simde_mm_set1_epi8(int8_t a) +{ + simde__m128i r; + +#if defined(SIMDE_SSE2_NATIVE) + r.n = _mm_set1_epi8(a); +#elif defined(SIMDE_SSE2_NEON) + r.neon_i8 = vdupq_n_s8(a); +#else + SIMDE__VECTORIZE + for (size_t i = 0; i < (sizeof(r.i8) / sizeof(r.i8[0])); i++) { + r.i8[i] = a; + } +#endif + + return r; +} + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128i simde_mm_set1_epi16(int16_t a) +{ + simde__m128i r; + +#if defined(SIMDE_SSE2_NATIVE) + r.n = _mm_set1_epi16(a); +#elif defined(SIMDE_SSE2_NEON) + r.neon_i16 = vdupq_n_s16(a); +#else + SIMDE__VECTORIZE + for (size_t i = 0; i < (sizeof(r.i16) / sizeof(r.i16[0])); i++) { + r.i16[i] = a; + } +#endif + + return r; +} + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128i simde_mm_set1_epi32(int32_t a) +{ + simde__m128i r; + +#if defined(SIMDE_SSE2_NATIVE) + r.n = _mm_set1_epi32(a); +#elif defined(SIMDE_SSE2_NEON) + r.neon_i32 = vdupq_n_s32(a); +#else + SIMDE__VECTORIZE + for (size_t i = 0; i < (sizeof(r.i32) / sizeof(r.i32[0])); i++) { + r.i32[i] = a; + } +#endif + + return r; +} + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128i simde_mm_set1_epi64x(int64_t a) +{ + simde__m128i r; + +#if defined(SIMDE_SSE2_NATIVE) + r.n = _mm_set1_epi64x(a); +#elif defined(SIMDE_SSE2_NEON) + r.neon_i64 = vmovq_n_s64(a); +#else + SIMDE__VECTORIZE + for (size_t i = 0; i < (sizeof(r.i64) / sizeof(r.i64[0])); i++) { + r.i64[i] = a; + } +#endif + + return r; +} + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128i simde_mm_set1_epi64(simde__m64 a) +{ + simde__m128i r; + +#if defined(SIMDE_SSE2_NATIVE) + r.n = _mm_set1_epi64(a.n); +#else + SIMDE__VECTORIZE + for (size_t i = 0; i < (sizeof(r.i64) / sizeof(r.i64[0])); i++) { + r.i64[i] = a.i64[0]; + } +#endif + + return r; +} + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128d simde_mm_set1_pd(simde_float64 a) +{ + simde__m128d r; + +#if defined(SIMDE_SSE2_NATIVE) + r.n = _mm_set1_pd(a); +#else + SIMDE__VECTORIZE + for (size_t i = 0; i < (sizeof(r.i64) / sizeof(r.i64[0])); i++) { + r.f64[i] = a; + } +#endif + + return r; +} + +SIMDE__FUNCTION_ATTRIBUTES 
+simde__m128i simde_mm_setr_epi8(int8_t e15, int8_t e14, int8_t e13, int8_t e12, + int8_t e11, int8_t e10, int8_t e9, int8_t e8, + int8_t e7, int8_t e6, int8_t e5, int8_t e4, + int8_t e3, int8_t e2, int8_t e1, int8_t e0) +{ + simde__m128i r; + +#if defined(SIMDE_SSE2_NATIVE) + r.n = _mm_setr_epi8(e15, e14, e13, e12, e11, e10, e9, e8, e7, e6, e5, + e4, e3, e2, e1, e0); +#elif defined(SIMDE_SSE2_NEON) + int8_t t[] = {e15, e14, e13, e12, e11, e10, e9, e8, + e7, e6, e5, e4, e3, e2, e1, e0}; + r.neon_i8 = vld1q_s8(t); +#else + r.i8[0] = e15; + r.i8[1] = e14; + r.i8[2] = e13; + r.i8[3] = e12; + r.i8[4] = e11; + r.i8[5] = e10; + r.i8[6] = e9; + r.i8[7] = e8; + r.i8[8] = e7; + r.i8[9] = e6; + r.i8[10] = e5; + r.i8[11] = e4; + r.i8[12] = e3; + r.i8[13] = e2; + r.i8[14] = e1; + r.i8[15] = e0; +#endif + + return r; +} + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128i simde_mm_setr_epi16(int16_t e7, int16_t e6, int16_t e5, int16_t e4, + int16_t e3, int16_t e2, int16_t e1, int16_t e0) +{ + simde__m128i r; + +#if defined(SIMDE_SSE2_NATIVE) + r.n = _mm_setr_epi16(e7, e6, e5, e4, e3, e2, e1, e0); +#elif defined(SIMDE_SSE2_NEON) + int16_t t[] = {e7, e6, e5, e4, e3, e2, e1, e0}; + r.neon_i16 = vld1q_s16(t); +#else + r.i16[0] = e7; + r.i16[1] = e6; + r.i16[2] = e5; + r.i16[3] = e4; + r.i16[4] = e3; + r.i16[5] = e2; + r.i16[6] = e1; + r.i16[7] = e0; +#endif + + return r; +} + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128i simde_mm_setr_epi32(int32_t e3, int32_t e2, int32_t e1, int32_t e0) +{ + simde__m128i r; + +#if defined(SIMDE_SSE2_NATIVE) + r.n = _mm_setr_epi32(e3, e2, e1, e0); +#elif defined(SIMDE_SSE2_NEON) + int32_t t[] = {e3, e2, e1, e0}; + r.neon_i32 = vld1q_s32(t); +#else + r.i32[0] = e3; + r.i32[1] = e2; + r.i32[2] = e1; + r.i32[3] = e0; +#endif + + return r; +} + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128i simde_mm_setr_epi64(simde__m64 e1, simde__m64 e0) +{ + simde__m128i r; + +#if defined(SIMDE_SSE2_NATIVE) + r.n = _mm_setr_epi64(e1.n, e0.n); +#elif defined(SIMDE_SSE2_NEON) + r.neon_i64 = vcombine_s64(e1.neon_i64, e0.neon_i64); +#else + r.i64[0] = e1.i64[0]; + r.i64[1] = e0.i64[0]; +#endif + + return r; +} + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128d simde_mm_setr_pd(simde_float64 e1, simde_float64 e0) +{ + simde__m128d r; + +#if defined(SIMDE_SSE2_NATIVE) + r.n = _mm_setr_pd(e1, e0); +#else + r.f64[0] = e1; + r.f64[1] = e0; +#endif + + return r; +} + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128d simde_mm_setzero_pd(void) +{ + simde__m128d r; + +#if defined(SIMDE_SSE2_NATIVE) + r.n = _mm_setzero_pd(); +#else + r.u64[0] = 0; + r.u64[1] = 0; +#endif + + return r; +} + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128i simde_mm_setzero_si128(void) +{ + simde__m128i r; + +#if defined(SIMDE_SSE2_NATIVE) + r.n = _mm_setzero_si128(); +#elif defined(SIMDE_SSE2_NEON) + r.neon_i32 = vdupq_n_s32(0); +#else + r.u64[0] = 0; + r.u64[1] = 0; +#endif + + return r; +} + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128i simde_mm_shuffle_epi32(simde__m128i a, const int imm8) +{ + simde__m128i r; + + for (size_t i = 0; i < (sizeof(r.i32) / sizeof(r.i32[0])); i++) { + r.i32[i] = a.i32[(imm8 >> (i * 2)) & 3]; + } + + return r; +} +#if defined(SIMDE_SSE2_NATIVE) +#define simde_mm_shuffle_epi32(a, imm8) \ + SIMDE__M128I_C(_mm_shuffle_epi32((a).n, (imm8))) +#elif defined(SIMDE__SHUFFLE_VECTOR) +#define simde_mm_shuffle_epi32(a, imm8) \ + ({ \ + const simde__m128i simde__tmp_a_ = a; \ + (simde__m128i){.i32 = SIMDE__SHUFFLE_VECTOR( \ + 32, 16, (simde__tmp_a_).i32, \ + (simde__tmp_a_).i32, ((imm8)) & 3, \ + ((imm8) >> 2) & 3, ((imm8) >> 4) & 3, \ + ((imm8) 
>> 6) & 3)}; \ + }) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128d simde_mm_shuffle_pd(simde__m128d a, simde__m128d b, const int imm8) +{ + simde__m128d r; + + r.f64[0] = ((imm8 & 1) == 0) ? a.f64[0] : a.f64[1]; + r.f64[1] = ((imm8 & 2) == 0) ? b.f64[0] : b.f64[1]; + + return r; +} +#if defined(SIMDE_SSE2_NATIVE) && !defined(__PGI) +#define simde_mm_shuffle_pd(a, b, imm8) \ + SIMDE__M128D_C(_mm_shuffle_pd((a).n, (b).n, (imm8))) +#elif defined(SIMDE__SHUFFLE_VECTOR) +#define simde_mm_shuffle_pd(a, b, imm8) \ + ({ \ + (simde__m128d){.f64 = SIMDE__SHUFFLE_VECTOR( \ + 64, 16, (a).f64, (b).f64, \ + (((imm8)) & 1), \ + (((imm8) >> 1) & 1) + 2)}; \ + }) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128i simde_mm_shufflehi_epi16(simde__m128i a, const int imm8) +{ + simde__m128i r; + + r.i64[0] = a.i64[0]; + for (size_t i = 4; i < (sizeof(r.i16) / sizeof(r.i16[0])); i++) { + r.i16[i] = a.i16[((imm8 >> ((i - 4) * 2)) & 3) + 4]; + } + + return r; +} +#if defined(SIMDE_SSE2_NATIVE) +#define simde_mm_shufflehi_epi16(a, imm8) \ + SIMDE__M128I_C(_mm_shufflehi_epi16((a).n, (imm8))) +#elif defined(SIMDE__SHUFFLE_VECTOR) +#define simde_mm_shufflehi_epi16(a, imm8) \ + ({ \ + const simde__m128i simde__tmp_a_ = a; \ + (simde__m128i){.i16 = SIMDE__SHUFFLE_VECTOR( \ + 16, 16, (simde__tmp_a_).i16, \ + (simde__tmp_a_).i16, 0, 1, 2, 3, \ + (((imm8)) & 3) + 4, \ + (((imm8) >> 2) & 3) + 4, \ + (((imm8) >> 4) & 3) + 4, \ + (((imm8) >> 6) & 3) + 4)}; \ + }) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128i simde_mm_shufflelo_epi16(simde__m128i a, const int imm8) +{ + simde__m128i r; + + for (size_t i = 0; i < ((sizeof(r.i16) / sizeof(r.i16[0])) / 2); i++) { + r.i16[i] = a.i16[((imm8 >> (i * 2)) & 3)]; + } + r.i64[1] = a.i64[1]; + + return r; +} +#if defined(SIMDE_SSE2_NATIVE) +#define simde_mm_shufflelo_epi16(a, imm8) \ + SIMDE__M128I_C(_mm_shufflelo_epi16((a).n, (imm8))) +#elif defined(SIMDE__SHUFFLE_VECTOR) +#define simde_mm_shufflelo_epi16(a, imm8) \ + ({ \ + const simde__m128i simde__tmp_a_ = a; \ + (simde__m128i){.i16 = SIMDE__SHUFFLE_VECTOR( \ + 16, 16, (simde__tmp_a_).i16, \ + (simde__tmp_a_).i16, (((imm8)) & 3), \ + (((imm8) >> 2) & 3), \ + (((imm8) >> 4) & 3), \ + (((imm8) >> 6) & 3), 4, 5, 6, 7)}; \ + }) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128i simde_mm_sll_epi16(simde__m128i a, simde__m128i count) +{ +#if defined(SIMDE_SSE2_NATIVE) + return SIMDE__M128I_C(_mm_sll_epi16(a.n, count.n)); +#else + simde__m128i r; + + if (count.u64[0] > 15) + return simde_mm_setzero_si128(); + const int s = (int)(count.u64[0]); + + SIMDE__VECTORIZE + for (size_t i = 0; i < (sizeof(r.u16) / sizeof(r.u16[0])); i++) { + r.u16[i] = a.u16[i] << s; + } + return r; +#endif +} + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128i simde_mm_sll_epi32(simde__m128i a, simde__m128i count) +{ +#if defined(SIMDE_SSE2_NATIVE) + return SIMDE__M128I_C(_mm_sll_epi32(a.n, count.n)); +#else + simde__m128i r; + + if (count.u64[0] > 31) + return simde_mm_setzero_si128(); + const int s = (int)(count.u64[0]); + + SIMDE__VECTORIZE + for (size_t i = 0; i < (sizeof(r.i32) / sizeof(r.i32[0])); i++) { + r.i32[i] = a.i32[i] << s; + } + return r; +#endif +} + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128i simde_mm_sll_epi64(simde__m128i a, simde__m128i count) +{ +#if defined(SIMDE_SSE2_NATIVE) + return SIMDE__M128I_C(_mm_sll_epi64(a.n, count.n)); +#else + simde__m128i r; + + if (count.u64[0] > 63) + return simde_mm_setzero_si128(); + const int s = (int)(count.u64[0]); + + SIMDE__VECTORIZE + for (size_t i = 0; i < (sizeof(r.i64) / sizeof(r.i64[0])); i++) { + 
r.i64[i] = a.i64[i] << s; + } + return r; +#endif +} + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128d simde_mm_sqrt_pd(simde__m128d a) +{ +#if defined(SIMDE_SSE2_NATIVE) + return SIMDE__M128D_C(_mm_sqrt_pd(a.n)); +#else + simde__m128d r; + + SIMDE__VECTORIZE + for (size_t i = 0; i < (sizeof(r.f64) / sizeof(r.f64[0])); i++) { + r.f64[i] = sqrt(a.f64[i]); + } + + return r; +#endif +} + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128d simde_mm_sqrt_sd(simde__m128d a, simde__m128d b) +{ +#if defined(SIMDE_SSE2_NATIVE) + return SIMDE__M128D_C(_mm_sqrt_sd(a.n, b.n)); +#else + simde__m128d r; + r.f64[0] = sqrt(b.f64[0]); + r.f64[1] = a.f64[1]; + return r; +#endif +} + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128i simde_mm_srl_epi16(simde__m128i a, simde__m128i count) +{ +#if defined(SIMDE_SSE2_NATIVE) + return SIMDE__M128I_C(_mm_srl_epi16(a.n, count.n)); +#else + simde__m128i r; + + if (count.u64[0] > 15) + return simde_mm_setzero_si128(); + const int s = (int)(count.u64[0]); + + SIMDE__VECTORIZE + for (size_t i = 0; i < (sizeof(r.u16) / sizeof(r.u16[0])); i++) { + r.u16[i] = a.u16[i] >> s; + } + return r; +#endif +} + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128i simde_mm_srl_epi32(simde__m128i a, simde__m128i count) +{ +#if defined(SIMDE_SSE2_NATIVE) + return SIMDE__M128I_C(_mm_srl_epi32(a.n, count.n)); +#else + simde__m128i r; + + if (count.u64[0] > 31) + return simde_mm_setzero_si128(); + const int s = (int)(count.u64[0]); + + SIMDE__VECTORIZE + for (size_t i = 0; i < (sizeof(r.u32) / sizeof(r.u32[0])); i++) { + r.u32[i] = a.u32[i] >> s; + } + return r; +#endif +} + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128i simde_mm_srl_epi64(simde__m128i a, simde__m128i count) +{ +#if defined(SIMDE_SSE2_NATIVE) + return SIMDE__M128I_C(_mm_srl_epi64(a.n, count.n)); +#else + simde__m128i r; + + if (count.u64[0] > 31) + return simde_mm_setzero_si128(); + const int s = (int)(count.u64[0]); + + SIMDE__VECTORIZE + for (size_t i = 0; i < (sizeof(r.u64) / sizeof(r.u64[0])); i++) { + r.u64[i] = a.u64[i] >> s; + } + return r; +#endif +} + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128i simde_mm_srai_epi16(simde__m128i a, int imm8) +{ + simde__m128i r; + + const uint16_t m = + (uint16_t)((~0U) << ((sizeof(int16_t) * CHAR_BIT) - imm8)); + + SIMDE__VECTORIZE + for (size_t i = 0; i < (sizeof(r) / sizeof(r.u16[0])); i++) { + const uint16_t is_neg = ((uint16_t)( + ((a.u16[i]) >> ((sizeof(int16_t) * CHAR_BIT) - 1)))); + r.u16[i] = (a.u16[i] >> imm8) | (m * is_neg); + } + + return r; +} +#if defined(SIMDE_SSE2_NATIVE) +#define simde_mm_srai_epi16(a, imm8) \ + SIMDE__M128I_C(_mm_srai_epi16((a).n, (imm8))); +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128i simde_mm_srai_epi32(simde__m128i a, int imm8) +{ + simde__m128i r; + + const uint32_t m = + (uint32_t)((~0U) << ((sizeof(int) * CHAR_BIT) - imm8)); + SIMDE__VECTORIZE + for (size_t i = 0; i < (sizeof(r) / sizeof(r.u32[0])); i++) { + uint32_t is_neg = ((uint32_t)( + ((a.u32[i]) >> ((sizeof(int32_t) * CHAR_BIT) - 1)))); + r.u32[i] = (a.u32[i] >> imm8) | (m * is_neg); + } + + return r; +} +#if defined(SIMDE_SSE2_NATIVE) +#define simde_mm_srai_epi32(a, imm8) \ + SIMDE__M128I_C(_mm_srai_epi32((a).n, (imm8))) +#elif defined(SIMDE_SSE2_NEON) +#define simde_mm_srai_epi32(a, imm8) \ + SIMDE__M128I_NEON_C( \ + i32, \ + ((imm8) <= 0) \ + ? (a.neon_i32) \ + : (((imm8) > 31) \ + ? 
(vshrq_n_s32(vshrq_n_s32(a.neon_i32, 16), \ + 16)) \ + : (vshrq_n_s32(a.neon_i32, (imm8))))) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128i simde_mm_sra_epi16(simde__m128i a, simde__m128i count) +{ +#if defined(SIMDE_SSE2_NATIVE) + return SIMDE__M128I_C(_mm_sra_epi16(a.n, count.n)); +#else + simde__m128i r; + int cnt = (int)count.i64[0]; + + if (cnt > 15 || cnt < 0) { + for (size_t i = 0; i < (sizeof(r.i16) / sizeof(r.i16[0])); + i++) { + r.u16[i] = (a.i16[i] < 0) ? 0xffff : 0x0000; + } + } else { + const uint16_t m = (uint16_t)( + (~0U) << ((sizeof(int16_t) * CHAR_BIT) - cnt)); + for (size_t i = 0; i < (sizeof(r.i16) / sizeof(r.i16[0])); + i++) { + const uint16_t is_neg = a.i16[i] < 0; + r.u16[i] = (a.u16[i] >> cnt) | (m * is_neg); + } + } + + return r; +#endif +} + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128i simde_mm_sra_epi32(simde__m128i a, simde__m128i count) +{ +#if defined(SIMDE_SSE2_NATIVE) && !defined(SIMDE_BUG_GCC_BAD_MM_SRA_EPI32) + return SIMDE__M128I_C(_mm_sra_epi32(a.n, count.n)); +#else + simde__m128i r; + const uint64_t cnt = count.u64[0]; + + if (cnt > 31) { + for (size_t i = 0; i < (sizeof(r.i32) / sizeof(r.i32[0])); + i++) { + r.u32[i] = (a.i32[i] < 0) ? UINT32_MAX : 0; + } + } else if (cnt == 0) { + memcpy(&r, &a, sizeof(r)); + } else { + const uint32_t m = (uint32_t)( + (~0U) << ((sizeof(int32_t) * CHAR_BIT) - cnt)); + for (size_t i = 0; i < (sizeof(r.i32) / sizeof(r.i32[0])); + i++) { + const uint32_t is_neg = a.i32[i] < 0; + r.u32[i] = (a.u32[i] >> cnt) | (m * is_neg); + } + } + + return r; +#endif +} + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128i simde_mm_slli_epi16(simde__m128i a, const int imm8) +{ + simde__m128i r; + const int s = (imm8 > ((int)sizeof(r.i16[0]) * CHAR_BIT) - 1) ? 0 + : imm8; + SIMDE__VECTORIZE + for (size_t i = 0; i < (sizeof(r.i16) / sizeof(r.i16[0])); i++) { + r.i16[i] = a.i16[i] << s; + } + return r; +} +#if defined(SIMDE_SSE2_NATIVE) +#define simde_mm_slli_epi16(a, imm8) SIMDE__M128I_C(_mm_slli_epi16(a.n, imm8)); +#elif defined(SIMDE_SSE2_NEON) +#define simde_mm_slli_epi16(a, imm8) \ + SIMDE__M128I_NEON_C( \ + i16, ((imm8) <= 0) \ + ? ((a).neon_i16) \ + : (((imm8) > 31) ? (vdupq_n_s16(0)) \ + : (vshlq_n_s16((a).neon_i16, \ + (imm8))))) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128i simde_mm_slli_epi32(simde__m128i a, const int imm8) +{ + simde__m128i r; + const int s = (imm8 > ((int)sizeof(r.i32[0]) * CHAR_BIT) - 1) ? 0 + : imm8; + SIMDE__VECTORIZE + for (size_t i = 0; i < (sizeof(r.i32) / sizeof(r.i32[0])); i++) { + r.i32[i] = a.i32[i] << s; + } + return r; +} +#if defined(SIMDE_SSE2_NATIVE) +#define simde_mm_slli_epi32(a, imm8) SIMDE__M128I_C(_mm_slli_epi32(a.n, imm8)); +#elif defined(SIMDE_SSE2_NEON) +#define simde_mm_slli_epi32(a, imm8) \ + SIMDE__M128I_NEON_C( \ + i32, ((imm8) <= 0) \ + ? ((a).neon_i32) \ + : (((imm8) > 31) ? (vdupq_n_s32(0)) \ + : (vshlq_n_s32((a).neon_i32, \ + (imm8))))) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128i simde_mm_slli_epi64(simde__m128i a, const int imm8) +{ + simde__m128i r; + const int s = (imm8 > ((int)sizeof(r.i64[0]) * CHAR_BIT) - 1) ? 0 + : imm8; + SIMDE__VECTORIZE + for (size_t i = 0; i < (sizeof(r.i64) / sizeof(r.i64[0])); i++) { + r.i64[i] = a.i64[i] << s; + } + return r; +} +#if defined(SIMDE_SSE2_NATIVE) +#define simde_mm_slli_epi64(a, imm8) SIMDE__M128I_C(_mm_slli_epi64(a.n, imm8)); +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128i simde_mm_srli_epi16(simde__m128i a, const int imm8) +{ + simde__m128i r; + const int s = (imm8 > ((int)sizeof(r.i16[0]) * CHAR_BIT) - 1) ? 
0 + : imm8; + SIMDE__VECTORIZE + for (size_t i = 0; i < (sizeof(r.i16) / sizeof(r.i16[0])); i++) { + r.u16[i] = a.u16[i] >> s; + } + return r; +} +#if defined(SIMDE_SSE2_NATIVE) +#define simde_mm_srli_epi16(a, imm8) SIMDE__M128I_C(_mm_srli_epi16(a.n, imm8)); +#elif defined(SIMDE_SSE2_NEON) +#define simde_mm_srli_epi16(a, imm8) \ + SIMDE__M128I_NEON_C( \ + u16, ((imm8) <= 0) \ + ? ((a).neon_u16) \ + : (((imm8) > 31) ? (vdupq_n_u16(0)) \ + : (vshrq_n_u16((a).neon_u16, \ + (imm8))))) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128i simde_mm_srli_epi32(simde__m128i a, const int imm8) +{ + simde__m128i r; + const int s = (imm8 > ((int)sizeof(r.i32[0]) * CHAR_BIT) - 1) ? 0 + : imm8; + SIMDE__VECTORIZE + for (size_t i = 0; i < (sizeof(r.i32) / sizeof(r.i32[0])); i++) { + r.u32[i] = a.u32[i] >> s; + } + return r; +} +#if defined(SIMDE_SSE2_NATIVE) +#define simde_mm_srli_epi32(a, imm8) SIMDE__M128I_C(_mm_srli_epi32(a.n, imm8)) +#elif defined(SIMDE_SSE2_NEON) +#define simde_mm_srli_epi32(a, imm8) \ + SIMDE__M128I_NEON_C( \ + u32, ((imm8) <= 0) \ + ? ((a).neon_u32) \ + : (((imm8) > 31) ? (vdupq_n_u32(0)) \ + : (vshrq_n_u32((a).neon_u32, \ + (imm8))))) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128i simde_mm_srli_epi64(simde__m128i a, const int imm8) +{ + simde__m128i r; + const unsigned char s = imm8 & 255; + SIMDE__VECTORIZE + for (size_t i = 0; i < (sizeof(r.i64) / sizeof(r.i64[0])); i++) { + if (s > 63) { + r.u64[i] = 0; + } else { + r.u64[i] = a.u64[i] >> s; + } + } + return r; +} +#if defined(SIMDE_SSE2_NATIVE) +#define simde_mm_srli_epi64(a, imm8) SIMDE__M128I_C(_mm_srli_epi64(a.n, imm8)) +#elif defined(SIMDE_SSE2_NEON) +#define simde_mm_srli_epi64(a, imm8) \ + SIMDE__M128I_NEON_C( \ + u64, \ + (((imm8)&255) < 0 || ((imm8)&255) > 63) \ + ? (vdupq_n_u64(0)) \ + : ((((imm8)&255) == 0) \ + ? 
(a.neon_u64) \ + : (vshrq_n_u64((a).neon_u64, (imm8)&255)))) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +void simde_mm_store_pd(simde_float64 mem_addr[HEDLEY_ARRAY_PARAM(2)], + simde__m128d a) +{ + simde_assert_aligned(16, mem_addr); + +#if defined(SIMDE_SSE2_NATIVE) + _mm_store_pd(mem_addr, a.n); +#else + SIMDE__ASSUME_ALIGNED(mem_addr, 16); + memcpy(mem_addr, &a, sizeof(a)); +#endif +} + +SIMDE__FUNCTION_ATTRIBUTES +void simde_mm_store1_pd(simde_float64 mem_addr[HEDLEY_ARRAY_PARAM(2)], + simde__m128d a) +{ + simde_assert_aligned(16, mem_addr); + +#if defined(SIMDE_SSE2_NATIVE) + _mm_store1_pd(mem_addr, a.n); +#else + SIMDE__ASSUME_ALIGNED(mem_addr, 16); + mem_addr[0] = a.f64[0]; + mem_addr[1] = a.f64[0]; +#endif +} +#define simde_mm_store_pd1(mem_addr, a) simde_mm_store1_pd(mem_addr, a) + +SIMDE__FUNCTION_ATTRIBUTES +void simde_mm_store_sd(simde_float64 *mem_addr, simde__m128d a) +{ +#if defined(SIMDE_SSE2_NATIVE) + _mm_store_sd(mem_addr, a.n); +#else + memcpy(mem_addr, &a, sizeof(a.f64[0])); +#endif +} + +SIMDE__FUNCTION_ATTRIBUTES +void simde_mm_store_si128(simde__m128i *mem_addr, simde__m128i a) +{ +#if defined(SIMDE_SSE2_NATIVE) + _mm_store_si128(&mem_addr->n, a.n); +#elif defined(SIMDE_SSE2_NEON) + vst1q_s32((int32_t *)mem_addr, a.neon_i32); +#else + SIMDE__ASSUME_ALIGNED(mem_addr, 16); + memcpy(mem_addr, &a, sizeof(a)); +#endif +} + +SIMDE__FUNCTION_ATTRIBUTES +void simde_mm_storeh_pd(simde_float64 *mem_addr, simde__m128d a) +{ +#if defined(SIMDE_SSE2_NATIVE) + _mm_storeh_pd(mem_addr, a.n); +#else + *mem_addr = a.f64[1]; +#endif +} + +SIMDE__FUNCTION_ATTRIBUTES +void simde_mm_storel_epi64(simde__m128i *mem_addr, simde__m128i a) +{ +#if defined(SIMDE_SSE2_NATIVE) + _mm_storel_epi64(&(mem_addr->n), a.n); +#elif defined(SIMDE_SSE2_NEON) + mem_addr->i64[0] = vgetq_lane_s64(a.neon_i64, 0); +#else + mem_addr->i64[0] = a.i64[0]; +#endif +} + +SIMDE__FUNCTION_ATTRIBUTES +void simde_mm_storel_pd(simde_float64 *mem_addr, simde__m128d a) +{ +#if defined(SIMDE_SSE2_NATIVE) + _mm_storel_pd(mem_addr, a.n); +#else + *mem_addr = a.f64[0]; +#endif +} + +SIMDE__FUNCTION_ATTRIBUTES +void simde_mm_storer_pd(simde_float64 mem_addr[2], simde__m128d a) +{ + simde_assert_aligned(16, mem_addr); + +#if defined(SIMDE_SSE2_NATIVE) + _mm_storer_pd(mem_addr, a.n); +#else + SIMDE__ASSUME_ALIGNED(mem_addr, 16); + mem_addr[0] = a.f64[1]; + mem_addr[1] = a.f64[0]; +#endif +} + +SIMDE__FUNCTION_ATTRIBUTES +void simde_mm_storeu_pd(simde_float64 *mem_addr, simde__m128d a) +{ +#if defined(SIMDE_SSE2_NATIVE) + _mm_storeu_pd(mem_addr, a.n); +#else + memcpy(mem_addr, &a, sizeof(a)); +#endif +} + +SIMDE__FUNCTION_ATTRIBUTES +void simde_mm_storeu_si128(simde__m128i *mem_addr, simde__m128i a) +{ +#if defined(SIMDE_SSE2_NATIVE) + _mm_storeu_si128(&mem_addr->n, a.n); +#elif defined(SIMDE_SSE2_NEON) + int32_t v[4]; + vst1q_s32(v, a.neon_i32); + memcpy(mem_addr, v, sizeof(v)); +#else + memcpy(mem_addr, &a, sizeof(a)); +#endif +} + +SIMDE__FUNCTION_ATTRIBUTES +void simde_mm_stream_pd(simde_float64 mem_addr[HEDLEY_ARRAY_PARAM(2)], + simde__m128d a) +{ +#if defined(SIMDE_SSE2_NATIVE) + _mm_stream_pd(mem_addr, a.n); +#else + SIMDE__ASSUME_ALIGNED(mem_addr, 16); + memcpy(mem_addr, &a, sizeof(a)); +#endif +} + +SIMDE__FUNCTION_ATTRIBUTES +void simde_mm_stream_si128(simde__m128i *mem_addr, simde__m128i a) +{ +#if defined(SIMDE_SSE2_NATIVE) + _mm_stream_si128(&mem_addr->n, a.n); +#else + SIMDE__ASSUME_ALIGNED(mem_addr, 16); + memcpy(mem_addr, &a, sizeof(a)); +#endif +} + +SIMDE__FUNCTION_ATTRIBUTES +void simde_mm_stream_si32(int32_t 
*mem_addr, int32_t a) +{ +#if defined(SIMDE_SSE2_NATIVE) + _mm_stream_si32(mem_addr, a); +#else + *mem_addr = a; +#endif +} + +SIMDE__FUNCTION_ATTRIBUTES +void simde_mm_stream_si64(int64_t *mem_addr, int64_t a) +{ +#if defined(SIMDE_SSE2_NATIVE) && defined(SIMDE_ARCH_AMD64) +#if defined(SIMDE__REALLY_GCC) && !HEDLEY_GCC_VERSION_CHECK(4, 8, 0) + *mem_addr = a; +#elif defined(__GNUC__) + _mm_stream_si64((long long *)mem_addr, a); +#else + _mm_stream_si64(mem_addr, a); +#endif +#else + *mem_addr = a; +#endif +} + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128i simde_mm_sub_epi8(simde__m128i a, simde__m128i b) +{ +#if defined(SIMDE_SSE2_NATIVE) + return SIMDE__M128I_C(_mm_sub_epi8(a.n, b.n)); +#elif defined(SIMDE_SSE2_NEON) + return SIMDE__M128I_NEON_C(i8, vsubq_s8(a.neon_i8, b.neon_i8)); +#else + simde__m128i r; + SIMDE__VECTORIZE + for (size_t i = 0; i < (sizeof(r.i8) / sizeof(r.i8[0])); i++) { + r.i8[i] = a.i8[i] - b.i8[i]; + } + return r; +#endif +} + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128i simde_mm_sub_epi16(simde__m128i a, simde__m128i b) +{ +#if defined(SIMDE_SSE2_NATIVE) + return SIMDE__M128I_C(_mm_sub_epi16(a.n, b.n)); +#elif defined(SIMDE_SSE2_NEON) + return SIMDE__M128I_NEON_C(i16, vsubq_s16(a.neon_i16, b.neon_i16)); +#else + simde__m128i r; + SIMDE__VECTORIZE + for (size_t i = 0; i < (sizeof(r.i16) / sizeof(r.i16[0])); i++) { + r.i16[i] = a.i16[i] - b.i16[i]; + } + return r; +#endif +} + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128i simde_mm_sub_epi32(simde__m128i a, simde__m128i b) +{ +#if defined(SIMDE_SSE2_NATIVE) + return SIMDE__M128I_C(_mm_sub_epi32(a.n, b.n)); +#elif defined(SIMDE_SSE2_NEON) + return SIMDE__M128I_NEON_C(i32, vsubq_s32(a.neon_i32, b.neon_i32)); +#else + simde__m128i r; + SIMDE__VECTORIZE + for (size_t i = 0; i < (sizeof(r.i32) / sizeof(r.i32[0])); i++) { + r.i32[i] = a.i32[i] - b.i32[i]; + } + return r; +#endif +} + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128i simde_mm_sub_epi64(simde__m128i a, simde__m128i b) +{ +#if defined(SIMDE_SSE2_NATIVE) + return SIMDE__M128I_C(_mm_sub_epi64(a.n, b.n)); +#elif defined(SIMDE_SSE2_NEON) + return SIMDE__M128I_NEON_C(i64, vsubq_s64(a.neon_i64, b.neon_i64)); +#else + simde__m128i r; + SIMDE__VECTORIZE + for (size_t i = 0; i < (sizeof(r.i64) / sizeof(r.i64[0])); i++) { + r.i64[i] = a.i64[i] - b.i64[i]; + } + return r; +#endif +} + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128d simde_mm_sub_pd(simde__m128d a, simde__m128d b) +{ +#if defined(SIMDE_SSE2_NATIVE) + return SIMDE__M128D_C(_mm_sub_pd(a.n, b.n)); +#else + simde__m128d r; + SIMDE__VECTORIZE + for (size_t i = 0; i < (sizeof(r.f64) / sizeof(r.f64[0])); i++) { + r.f64[i] = a.f64[i] - b.f64[i]; + } + return r; +#endif +} + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128d simde_mm_sub_sd(simde__m128d a, simde__m128d b) +{ +#if defined(SIMDE_SSE2_NATIVE) + return SIMDE__M128D_C(_mm_sub_sd(a.n, b.n)); +#else + simde__m128d r; + r.f64[0] = a.f64[0] - b.f64[0]; + r.f64[1] = a.f64[1]; + return r; +#endif +} + +SIMDE__FUNCTION_ATTRIBUTES +simde__m64 simde_mm_sub_si64(simde__m64 a, simde__m64 b) +{ +#if defined(SIMDE_SSE2_NATIVE) + return SIMDE__M64_C(_mm_sub_si64(a.n, b.n)); +#else + simde__m64 r; + r.i64[0] = a.i64[0] - b.i64[0]; + return r; +#endif +} + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128i simde_mm_subs_epi8(simde__m128i a, simde__m128i b) +{ +#if defined(SIMDE_SSE2_NATIVE) + return SIMDE__M128I_C(_mm_subs_epi8(a.n, b.n)); +#elif defined(SIMDE_SSE2_NEON) + return SIMDE__M128I_NEON_C(i8, vqsubq_s8(a.neon_i8, b.neon_i8)); +#else + simde__m128i r; + SIMDE__VECTORIZE + for (size_t i = 0; i < 
(sizeof(r) / sizeof(r.i8[0])); i++) { + if (((b.i8[i]) > 0 && (a.i8[i]) < INT8_MIN + (b.i8[i]))) { + r.i8[i] = INT8_MIN; + } else if ((b.i8[i]) < 0 && (a.i8[i]) > INT8_MAX + (b.i8[i])) { + r.i8[i] = INT8_MAX; + } else { + r.i8[i] = (a.i8[i]) - (b.i8[i]); + } + } + return r; +#endif +} + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128i simde_mm_subs_epi16(simde__m128i a, simde__m128i b) +{ +#if defined(SIMDE_SSE2_NATIVE) + return SIMDE__M128I_C(_mm_subs_epi16(a.n, b.n)); +#elif defined(SIMDE_SSE2_NEON) + return SIMDE__M128I_NEON_C(i16, vqsubq_s16(a.neon_i16, b.neon_i16)); +#else + simde__m128i r; + SIMDE__VECTORIZE + for (size_t i = 0; i < (sizeof(r) / sizeof(r.i16[0])); i++) { + if (((b.i16[i]) > 0 && (a.i16[i]) < INT16_MIN + (b.i16[i]))) { + r.i16[i] = INT16_MIN; + } else if ((b.i16[i]) < 0 && + (a.i16[i]) > INT16_MAX + (b.i16[i])) { + r.i16[i] = INT16_MAX; + } else { + r.i16[i] = (a.i16[i]) - (b.i16[i]); + } + } + return r; +#endif +} + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128i simde_mm_subs_epu8(simde__m128i a, simde__m128i b) +{ +#if defined(SIMDE_SSE2_NATIVE) + return SIMDE__M128I_C(_mm_subs_epu8(a.n, b.n)); +#elif defined(SIMDE_SSE2_NEON) + return SIMDE__M128I_NEON_C(u8, vqsubq_u8(a.neon_u8, b.neon_u8)); +#else + simde__m128i r; + SIMDE__VECTORIZE + for (size_t i = 0; i < (sizeof(r) / sizeof(r.i8[0])); i++) { + const int32_t x = a.u8[i] - b.u8[i]; + if (x < 0) { + r.u8[i] = 0; + } else if (x > UINT8_MAX) { + r.u8[i] = UINT8_MAX; + } else { + r.u8[i] = (uint8_t)x; + } + } + return r; +#endif +} + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128i simde_mm_subs_epu16(simde__m128i a, simde__m128i b) +{ +#if defined(SIMDE_SSE2_NATIVE) + return SIMDE__M128I_C(_mm_subs_epu16(a.n, b.n)); +#elif defined(SIMDE_SSE2_NEON) + return SIMDE__M128I_NEON_C(u16, vqsubq_u16(a.neon_u16, b.neon_u16)); +#else + simde__m128i r; + SIMDE__VECTORIZE + for (size_t i = 0; i < (sizeof(r) / sizeof(r.i16[0])); i++) { + const int32_t x = a.u16[i] - b.u16[i]; + if (x < 0) { + r.u16[i] = 0; + } else if (x > UINT16_MAX) { + r.u16[i] = UINT16_MAX; + } else { + r.u16[i] = (uint16_t)x; + } + } + return r; +#endif +} + +SIMDE__FUNCTION_ATTRIBUTES +int simde_mm_ucomieq_sd(simde__m128d a, simde__m128d b) +{ +#if defined(SIMDE_SSE2_NATIVE) + return _mm_ucomieq_sd(a.n, b.n); +#else + fenv_t envp; + int x = feholdexcept(&envp); + int r = a.f64[0] == b.f64[0]; + if (HEDLEY_LIKELY(x == 0)) + fesetenv(&envp); + return r; +#endif +} + +SIMDE__FUNCTION_ATTRIBUTES +int simde_mm_ucomige_sd(simde__m128d a, simde__m128d b) +{ +#if defined(SIMDE_SSE2_NATIVE) + return _mm_ucomige_sd(a.n, b.n); +#else + fenv_t envp; + int x = feholdexcept(&envp); + int r = a.f64[0] >= b.f64[0]; + if (HEDLEY_LIKELY(x == 0)) + fesetenv(&envp); + return r; +#endif +} + +SIMDE__FUNCTION_ATTRIBUTES +int simde_mm_ucomigt_sd(simde__m128d a, simde__m128d b) +{ +#if defined(SIMDE_SSE2_NATIVE) + return _mm_ucomigt_sd(a.n, b.n); +#else + fenv_t envp; + int x = feholdexcept(&envp); + int r = a.f64[0] > b.f64[0]; + if (HEDLEY_LIKELY(x == 0)) + fesetenv(&envp); + return r; +#endif +} + +SIMDE__FUNCTION_ATTRIBUTES +int simde_mm_ucomile_sd(simde__m128d a, simde__m128d b) +{ +#if defined(SIMDE_SSE2_NATIVE) + return _mm_ucomile_sd(a.n, b.n); +#else + fenv_t envp; + int x = feholdexcept(&envp); + int r = a.f64[0] <= b.f64[0]; + if (HEDLEY_LIKELY(x == 0)) + fesetenv(&envp); + return r; +#endif +} + +SIMDE__FUNCTION_ATTRIBUTES +int simde_mm_ucomilt_sd(simde__m128d a, simde__m128d b) +{ +#if defined(SIMDE_SSE2_NATIVE) + return _mm_ucomilt_sd(a.n, b.n); +#else + fenv_t envp; + int x = 
feholdexcept(&envp); + int r = a.f64[0] < b.f64[0]; + if (HEDLEY_LIKELY(x == 0)) + fesetenv(&envp); + return r; +#endif +} + +SIMDE__FUNCTION_ATTRIBUTES +int simde_mm_ucomineq_sd(simde__m128d a, simde__m128d b) +{ +#if defined(SIMDE_SSE2_NATIVE) + return _mm_ucomineq_sd(a.n, b.n); +#else + fenv_t envp; + int x = feholdexcept(&envp); + int r = a.f64[0] != b.f64[0]; + if (HEDLEY_LIKELY(x == 0)) + fesetenv(&envp); + return r; +#endif +} + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128d simde_mm_undefined_pd(void) +{ + simde__m128d r; + +#if defined(SIMDE_SSE2_NATIVE) && defined(SIMDE__HAVE_UNDEFINED128) + r.n = _mm_undefined_pd(); +#else + r = simde_mm_setzero_pd(); +#endif + + return r; +} + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128i simde_mm_undefined_si128(void) +{ + simde__m128i r; + +#if defined(SIMDE_SSE2_NATIVE) && defined(SIMDE__HAVE_UNDEFINED128) + r.n = _mm_undefined_si128(); +#else + r = simde_mm_setzero_si128(); +#endif + + return r; +} + +SIMDE__FUNCTION_ATTRIBUTES +void simde_mm_lfence(void) +{ +#if defined(SIMDE_SSE2_NATIVE) + _mm_lfence(); +#else + simde_mm_sfence(); +#endif +} + +SIMDE__FUNCTION_ATTRIBUTES +void simde_mm_mfence(void) +{ +#if defined(SIMDE_SSE2_NATIVE) + _mm_mfence(); +#else + simde_mm_sfence(); +#endif +} + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128i simde_mm_unpackhi_epi8(simde__m128i a, simde__m128i b) +{ +#if defined(SIMDE_SSE2_NATIVE) + return SIMDE__M128I_C(_mm_unpackhi_epi8(a.n, b.n)); +#elif defined(SIMDE_SSE2_NEON) + int8x8_t a1 = vreinterpret_s8_s16(vget_high_s16(a.neon_i16)); + int8x8_t b1 = vreinterpret_s8_s16(vget_high_s16(b.neon_i16)); + int8x8x2_t result = vzip_s8(a1, b1); + return SIMDE__M128I_NEON_C(i8, + vcombine_s8(result.val[0], result.val[1])); +#else + simde__m128i r; + SIMDE__VECTORIZE + for (size_t i = 0; i < ((sizeof(r) / sizeof(r.i8[0])) / 2); i++) { + r.i8[(i * 2)] = a.i8[i + ((sizeof(r) / sizeof(r.i8[0])) / 2)]; + r.i8[(i * 2) + 1] = + b.i8[i + ((sizeof(r) / sizeof(r.i8[0])) / 2)]; + } + return r; +#endif +} + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128i simde_mm_unpackhi_epi16(simde__m128i a, simde__m128i b) +{ +#if defined(SIMDE_SSE2_NATIVE) + return SIMDE__M128I_C(_mm_unpackhi_epi16(a.n, b.n)); +#elif defined(SIMDE_SSE2_NEON) + int16x4_t a1 = vget_high_s16(a.neon_i16); + int16x4_t b1 = vget_high_s16(b.neon_i16); + int16x4x2_t result = vzip_s16(a1, b1); + return SIMDE__M128I_NEON_C(i16, + vcombine_s16(result.val[0], result.val[1])); +#else + simde__m128i r; + SIMDE__VECTORIZE + for (size_t i = 0; i < ((sizeof(r) / sizeof(r.i16[0])) / 2); i++) { + r.i16[(i * 2)] = + a.i16[i + ((sizeof(r) / sizeof(r.i16[0])) / 2)]; + r.i16[(i * 2) + 1] = + b.i16[i + ((sizeof(r) / sizeof(r.i16[0])) / 2)]; + } + return r; +#endif +} + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128i simde_mm_unpackhi_epi32(simde__m128i a, simde__m128i b) +{ +#if defined(SIMDE_SSE2_NATIVE) + return SIMDE__M128I_C(_mm_unpackhi_epi32(a.n, b.n)); +#elif defined(SIMDE_SSE2_NEON) + int32x2_t a1 = vget_high_s32(a.neon_i32); + int32x2_t b1 = vget_high_s32(b.neon_i32); + int32x2x2_t result = vzip_s32(a1, b1); + return SIMDE__M128I_NEON_C(i32, + vcombine_s32(result.val[0], result.val[1])); +#else + simde__m128i r; + SIMDE__VECTORIZE + for (size_t i = 0; i < ((sizeof(r) / sizeof(r.i32[0])) / 2); i++) { + r.i32[(i * 2)] = + a.i32[i + ((sizeof(r) / sizeof(r.i32[0])) / 2)]; + r.i32[(i * 2) + 1] = + b.i32[i + ((sizeof(r) / sizeof(r.i32[0])) / 2)]; + } + return r; +#endif +} + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128i simde_mm_unpackhi_epi64(simde__m128i a, simde__m128i b) +{ +#if 
defined(SIMDE_SSE2_NATIVE) + return SIMDE__M128I_C(_mm_unpackhi_epi64(a.n, b.n)); +#else + simde__m128i r; + SIMDE__VECTORIZE + for (size_t i = 0; i < ((sizeof(r) / sizeof(r.i64[0])) / 2); i++) { + r.i64[(i * 2)] = + a.i64[i + ((sizeof(r) / sizeof(r.i64[0])) / 2)]; + r.i64[(i * 2) + 1] = + b.i64[i + ((sizeof(r) / sizeof(r.i64[0])) / 2)]; + } + return r; +#endif +} + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128d simde_mm_unpackhi_pd(simde__m128d a, simde__m128d b) +{ +#if defined(SIMDE_SSE2_NATIVE) + return SIMDE__M128D_C(_mm_unpackhi_pd(a.n, b.n)); +#else + simde__m128d r; + SIMDE__VECTORIZE + for (size_t i = 0; i < ((sizeof(r) / sizeof(r.f64[0])) / 2); i++) { + r.f64[(i * 2)] = + a.f64[i + ((sizeof(r) / sizeof(r.f64[0])) / 2)]; + r.f64[(i * 2) + 1] = + b.f64[i + ((sizeof(r) / sizeof(r.f64[0])) / 2)]; + } + return r; +#endif +} + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128i simde_mm_unpacklo_epi8(simde__m128i a, simde__m128i b) +{ +#if defined(SIMDE_SSE2_NATIVE) + return SIMDE__M128I_C(_mm_unpacklo_epi8(a.n, b.n)); +#elif defined(SIMDE_SSE2_NEON) + int8x8_t a1 = vreinterpret_s8_s16(vget_low_s16(a.neon_i16)); + int8x8_t b1 = vreinterpret_s8_s16(vget_low_s16(b.neon_i16)); + int8x8x2_t result = vzip_s8(a1, b1); + return SIMDE__M128I_NEON_C(i8, + vcombine_s8(result.val[0], result.val[1])); +#else + simde__m128i r; + SIMDE__VECTORIZE + for (size_t i = 0; i < ((sizeof(r) / sizeof(r.i8[0])) / 2); i++) { + r.i8[(i * 2)] = a.i8[i]; + r.i8[(i * 2) + 1] = b.i8[i]; + } + return r; +#endif +} + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128i simde_mm_unpacklo_epi16(simde__m128i a, simde__m128i b) +{ +#if defined(SIMDE_SSE2_NATIVE) + return SIMDE__M128I_C(_mm_unpacklo_epi16(a.n, b.n)); +#elif defined(SIMDE_SSE2_NEON) + int16x4_t a1 = vget_low_s16(a.neon_i16); + int16x4_t b1 = vget_low_s16(b.neon_i16); + int16x4x2_t result = vzip_s16(a1, b1); + return SIMDE__M128I_NEON_C(i16, + vcombine_s16(result.val[0], result.val[1])); +#else + simde__m128i r; + SIMDE__VECTORIZE + for (size_t i = 0; i < ((sizeof(r) / sizeof(r.i16[0])) / 2); i++) { + r.i16[(i * 2)] = a.i16[i]; + r.i16[(i * 2) + 1] = b.i16[i]; + } + return r; +#endif +} + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128i simde_mm_unpacklo_epi32(simde__m128i a, simde__m128i b) +{ +#if defined(SIMDE_SSE2_NATIVE) + return SIMDE__M128I_C(_mm_unpacklo_epi32(a.n, b.n)); +#elif defined(SIMDE_SSE2_NEON) + int32x2_t a1 = vget_low_s32(a.neon_i32); + int32x2_t b1 = vget_low_s32(b.neon_i32); + int32x2x2_t result = vzip_s32(a1, b1); + return SIMDE__M128I_NEON_C(i32, + vcombine_s32(result.val[0], result.val[1])); +#else + simde__m128i r; + SIMDE__VECTORIZE + for (size_t i = 0; i < ((sizeof(r) / sizeof(r.i32[0])) / 2); i++) { + r.i32[(i * 2)] = a.i32[i]; + r.i32[(i * 2) + 1] = b.i32[i]; + } + return r; +#endif +} + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128i simde_mm_unpacklo_epi64(simde__m128i a, simde__m128i b) +{ +#if defined(SIMDE_SSE2_NATIVE) + return SIMDE__M128I_C(_mm_unpacklo_epi64(a.n, b.n)); +#else + simde__m128i r; + SIMDE__VECTORIZE + for (size_t i = 0; i < ((sizeof(r) / sizeof(r.i64[0])) / 2); i++) { + r.i64[(i * 2)] = a.i64[i]; + r.i64[(i * 2) + 1] = b.i64[i]; + } + return r; +#endif +} + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128d simde_mm_unpacklo_pd(simde__m128d a, simde__m128d b) +{ +#if defined(SIMDE_SSE2_NATIVE) + return SIMDE__M128D_C(_mm_unpacklo_pd(a.n, b.n)); +#else + simde__m128d r; + SIMDE__VECTORIZE + for (size_t i = 0; i < ((sizeof(r) / sizeof(r.f64[0])) / 2); i++) { + r.f64[(i * 2)] = a.f64[i]; + r.f64[(i * 2) + 1] = b.f64[i]; + } + return r; +#endif +} + 
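+/* The bitwise helpers below (xor_pd, xor_si128, and the x_mm_not_si128
+ * extension) operate on the integer views of the vector union, so the
+ * portable fallback is a plain element-wise XOR/NOT regardless of how the
+ * lanes are interpreted afterwards. */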
+SIMDE__FUNCTION_ATTRIBUTES +simde__m128d simde_mm_xor_pd(simde__m128d a, simde__m128d b) +{ +#if defined(SIMDE_SSE2_NATIVE) + return SIMDE__M128D_C(_mm_xor_pd(a.n, b.n)); +#else + simde__m128d r; + SIMDE__VECTORIZE + for (size_t i = 0; i < (sizeof(r.i64) / sizeof(r.i64[0])); i++) { + r.i64[i] = a.i64[i] ^ b.i64[i]; + } + return r; +#endif +} + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128i simde_mm_xor_si128(simde__m128i a, simde__m128i b) +{ +#if defined(SIMDE_SSE2_NATIVE) + return SIMDE__M128I_C(_mm_xor_si128(a.n, b.n)); +#elif defined(SIMDE_SSE2_NEON) + return SIMDE__M128I_NEON_C(i32, veorq_s32(a.neon_i32, b.neon_i32)); +#else + simde__m128i r; + SIMDE__VECTORIZE + for (size_t i = 0; i < (sizeof(r.i32) / sizeof(r.i32[0])); i++) { + r.i32[i] = a.i32[i] ^ b.i32[i]; + } + return r; +#endif +} + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128i simde_x_mm_not_si128(simde__m128i a) +{ +#if defined(SIMDE_SSE2_NEON) + return SIMDE__M128I_NEON_C(i32, vmvnq_s32(a.neon_i32)); +#else + simde__m128i r; + SIMDE__VECTORIZE + for (size_t i = 0; i < (sizeof(r.i32) / sizeof(r.i32[0])); i++) { + r.i32[i] = ~(a.i32[i]); + } + return r; +#endif +} + +SIMDE__END_DECLS + +#endif /* !defined(SIMDE__SSE2_H) */ diff --git a/libobs/util/sse-intrin.h b/libobs/util/sse-intrin.h new file mode 100644 index 000000000..d15e0abe9 --- /dev/null +++ b/libobs/util/sse-intrin.h @@ -0,0 +1,66 @@ +/****************************************************************************** + Copyright (C) 2019 by Peter Geis + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . 
+******************************************************************************/
+
+#pragma once
+
+#ifdef __aarch64__
+
+#include "aarch/sse2.h"
+
+#define __m128 simde__m128
+#define _mm_setzero_ps simde_mm_setzero_ps
+#define _mm_set_ps simde_mm_set_ps
+#define _mm_add_ps simde_mm_add_ps
+#define _mm_sub_ps simde_mm_sub_ps
+#define _mm_mul_ps simde_mm_mul_ps
+#define _mm_div_ps simde_mm_div_ps
+#define _mm_set1_ps simde_mm_set1_ps
+#define _mm_movehl_ps simde_mm_movehl_ps
+#define _mm_shuffle_ps simde_mm_shuffle_ps
+#define _mm_min_ps simde_mm_min_ps
+#define _mm_max_ps simde_mm_max_ps
+#define _mm_movelh_ps simde_mm_movelh_ps
+#define _mm_unpacklo_ps simde_mm_unpacklo_ps
+#define _mm_unpackhi_ps simde_mm_unpackhi_ps
+#define _mm_load_ps simde_mm_load_ps
+#define _mm_andnot_ps simde_mm_andnot_ps
+#define _mm_storeu_ps simde_mm_storeu_ps
+#define _mm_loadu_ps simde_mm_loadu_ps
+
+#define __m128i simde__m128i
+#define _mm_set1_epi32 simde_mm_set1_epi32
+#define _mm_set1_epi16 simde_mm_set1_epi16
+#define _mm_load_si128 simde_mm_load_si128
+#define _mm_packs_epi32 simde_mm_packs_epi32
+#define _mm_srli_si128 simde_mm_srli_si128
+#define _mm_and_si128 simde_mm_and_si128
+#define _mm_packus_epi16 simde_mm_packus_epi16
+#define _mm_add_epi64 simde_mm_add_epi64
+#define _mm_shuffle_epi32 simde_mm_shuffle_epi32
+#define _mm_srai_epi16 simde_mm_srai_epi16
+#define _mm_shufflelo_epi16 simde_mm_shufflelo_epi16
+#define _mm_storeu_si128 simde_mm_storeu_si128
+
+#define _MM_SHUFFLE SIMDE_MM_SHUFFLE
+#define _MM_TRANSPOSE4_PS SIMDE_MM_TRANSPOSE4_PS
+
+#else
+
+#include <xmmintrin.h>
+
+#include <emmintrin.h>
+
+#endif
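
This is how a caller is meant to consume the shim (a hypothetical snippet, not a file in this patch): existing SSE code keeps its _mm_* calls and only the include changes, so the same source resolves to the simde implementations on aarch64 and to the native intrinsics elsewhere.

	#include "util/sse-intrin.h"

	/* hypothetical example: zero four packed floats */
	static void zero4(float *dst)
	{
		__m128 v = _mm_setzero_ps(); /* simde_mm_setzero_ps on aarch64 */
		_mm_storeu_ps(dst, v);       /* simde_mm_storeu_ps on aarch64 */
	}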