From 3f17c80a35853f56ba6480b56923eef8438fa367 Mon Sep 17 00:00:00 2001 From: Marco Barbone Date: Tue, 2 Jun 2026 14:21:08 -0400 Subject: [PATCH] perf(avx512vl): native EVEX for int64 sra, signed rotr, compress/expand Add pure-VL overrides so these stop falling back to AVX2/SSE: - int64 signed >> -> vpsraq/vpsravq (unsigned/32-bit unchanged) - signed rotr -> vprorvq/vprolq (drop is_unsigned guard, mirror rotl) - compress/expand -> EVEX forms; 8/16-bit fall through to common{} --- include/xsimd/arch/xsimd_avx512vl_128.hpp | 147 +++++++++++++++++++--- include/xsimd/arch/xsimd_avx512vl_256.hpp | 147 +++++++++++++++++++--- 2 files changed, 254 insertions(+), 40 deletions(-) diff --git a/include/xsimd/arch/xsimd_avx512vl_128.hpp b/include/xsimd/arch/xsimd_avx512vl_128.hpp index 855870af3..1df440e37 100644 --- a/include/xsimd/arch/xsimd_avx512vl_128.hpp +++ b/include/xsimd/arch/xsimd_avx512vl_128.hpp @@ -378,6 +378,48 @@ namespace xsimd return (typename batch_bool::register_type)_mm_cmp_pd_mask(self, self, _CMP_UNORD_Q); } + // bitwise_rshift — signed int64 uses the native EVEX arithmetic shift + // (VPSRAQ / VPSRAVQ, lat 1 / CPI 0.5). Every other width/sign keeps the + // inherited avx2_128 codegen (srai/srav for 32-bit, srli for unsigned 64). + template ::value>> + XSIMD_INLINE batch bitwise_rshift(batch const& self, int32_t other, requires_arch) noexcept + { + XSIMD_IF_CONSTEXPR(std::is_signed::value && sizeof(T) == 8) + { + return _mm_srai_epi64(self, other); + } + else + { + return bitwise_rshift(self, other, avx2_128 {}); + } + } + template ::value>> + XSIMD_INLINE batch bitwise_rshift(batch const& self, requires_arch) noexcept + { + constexpr auto bits = std::numeric_limits::digits + std::numeric_limits::is_signed; + static_assert(shift < bits, "Shift amount must be less than the number of bits in T"); + XSIMD_IF_CONSTEXPR(std::is_signed::value && sizeof(T) == 8) + { + return _mm_srai_epi64(self, shift); + } + else + { + return bitwise_rshift(self, avx2_128 {}); + } + } + template ::value>> + XSIMD_INLINE batch bitwise_rshift(batch const& self, batch const& other, requires_arch) noexcept + { + XSIMD_IF_CONSTEXPR(std::is_signed::value && sizeof(T) == 8) + { + return _mm_srav_epi64(self, other); + } + else + { + return bitwise_rshift(self, other, avx2_128 {}); + } + } + // rotl template ::value>> XSIMD_INLINE batch rotl(batch const& self, batch const& other, requires_arch) noexcept @@ -423,18 +465,18 @@ namespace xsimd template ::value>> XSIMD_INLINE batch rotr(batch const& self, batch const& other, requires_arch) noexcept { - XSIMD_IF_CONSTEXPR(std::is_unsigned::value) + XSIMD_IF_CONSTEXPR(sizeof(T) == 4) { - XSIMD_IF_CONSTEXPR(sizeof(T) == 4) - { - return _mm_rorv_epi32(self, other); - } - else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) - { - return _mm_rorv_epi64(self, other); - } + return _mm_rorv_epi32(self, other); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) + { + return _mm_rorv_epi64(self, other); + } + else + { + return rotr(self, other, avx2_128 {}); } - return rotr(self, other, avx2_128 {}); } template ::value>> XSIMD_INLINE batch rotr(batch const& self, int32_t other, requires_arch) noexcept @@ -447,18 +489,83 @@ namespace xsimd { constexpr auto bits = std::numeric_limits::digits + std::numeric_limits::is_signed; static_assert(count < bits, "Count must be less than the number of bits in T"); - XSIMD_IF_CONSTEXPR(std::is_unsigned::value) + XSIMD_IF_CONSTEXPR(sizeof(T) == 4) { - XSIMD_IF_CONSTEXPR(sizeof(T) == 4) - { - return _mm_ror_epi32(self, count); - } - else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) - { - return _mm_ror_epi64(self, count); - } + return _mm_ror_epi32(self, count); } - return rotr(self, avx2_128 {}); + else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) + { + return _mm_ror_epi64(self, count); + } + else + { + return rotr(self, avx2_128 {}); + } + } + + // compress — native EVEX VPCOMPRESS{PS,PD,Q,D} for the widths with VL + // forms. 8/16-bit need AVX512_VBMI2, so they fall through to common{}. + template + XSIMD_INLINE batch compress(batch const& self, batch_bool const& mask, requires_arch) noexcept + { + return _mm_maskz_compress_ps(mask.mask(), self); + } + template + XSIMD_INLINE batch compress(batch const& self, batch_bool const& mask, requires_arch) noexcept + { + return _mm_maskz_compress_pd(mask.mask(), self); + } + template + XSIMD_INLINE batch compress(batch const& self, batch_bool const& mask, requires_arch) noexcept + { + return _mm_maskz_compress_epi32(mask.mask(), self); + } + template + XSIMD_INLINE batch compress(batch const& self, batch_bool const& mask, requires_arch) noexcept + { + return _mm_maskz_compress_epi32(mask.mask(), self); + } + template + XSIMD_INLINE batch compress(batch const& self, batch_bool const& mask, requires_arch) noexcept + { + return _mm_maskz_compress_epi64(mask.mask(), self); + } + template + XSIMD_INLINE batch compress(batch const& self, batch_bool const& mask, requires_arch) noexcept + { + return _mm_maskz_compress_epi64(mask.mask(), self); + } + + // expand + template + XSIMD_INLINE batch expand(batch const& self, batch_bool const& mask, requires_arch) noexcept + { + return _mm_maskz_expand_ps(mask.mask(), self); + } + template + XSIMD_INLINE batch expand(batch const& self, batch_bool const& mask, requires_arch) noexcept + { + return _mm_maskz_expand_pd(mask.mask(), self); + } + template + XSIMD_INLINE batch expand(batch const& self, batch_bool const& mask, requires_arch) noexcept + { + return _mm_maskz_expand_epi32(mask.mask(), self); + } + template + XSIMD_INLINE batch expand(batch const& self, batch_bool const& mask, requires_arch) noexcept + { + return _mm_maskz_expand_epi32(mask.mask(), self); + } + template + XSIMD_INLINE batch expand(batch const& self, batch_bool const& mask, requires_arch) noexcept + { + return _mm_maskz_expand_epi64(mask.mask(), self); + } + template + XSIMD_INLINE batch expand(batch const& self, batch_bool const& mask, requires_arch) noexcept + { + return _mm_maskz_expand_epi64(mask.mask(), self); } // all diff --git a/include/xsimd/arch/xsimd_avx512vl_256.hpp b/include/xsimd/arch/xsimd_avx512vl_256.hpp index c0b4a568e..730cf6724 100644 --- a/include/xsimd/arch/xsimd_avx512vl_256.hpp +++ b/include/xsimd/arch/xsimd_avx512vl_256.hpp @@ -460,6 +460,48 @@ namespace xsimd return (typename batch_bool::register_type)_mm256_cmp_pd_mask(self, self, _CMP_UNORD_Q); } + // bitwise_rshift — signed int64 uses the native EVEX arithmetic shift + // (VPSRAQ / VPSRAVQ, lat 1 / CPI 0.5). Every other width/sign keeps the + // inherited avx2 codegen (srai/srav for 32-bit, srli for unsigned 64). + template ::value>> + XSIMD_INLINE batch bitwise_rshift(batch const& self, int32_t other, requires_arch) noexcept + { + XSIMD_IF_CONSTEXPR(std::is_signed::value && sizeof(T) == 8) + { + return _mm256_srai_epi64(self, other); + } + else + { + return bitwise_rshift(self, other, avx2 {}); + } + } + template ::value>> + XSIMD_INLINE batch bitwise_rshift(batch const& self, requires_arch) noexcept + { + constexpr auto bits = std::numeric_limits::digits + std::numeric_limits::is_signed; + static_assert(shift < bits, "Shift amount must be less than the number of bits in T"); + XSIMD_IF_CONSTEXPR(std::is_signed::value && sizeof(T) == 8) + { + return _mm256_srai_epi64(self, shift); + } + else + { + return bitwise_rshift(self, avx2 {}); + } + } + template ::value>> + XSIMD_INLINE batch bitwise_rshift(batch const& self, batch const& other, requires_arch) noexcept + { + XSIMD_IF_CONSTEXPR(std::is_signed::value && sizeof(T) == 8) + { + return _mm256_srav_epi64(self, other); + } + else + { + return bitwise_rshift(self, other, avx2 {}); + } + } + // rotl template ::value>> XSIMD_INLINE batch rotl(batch const& self, batch const& other, requires_arch) noexcept @@ -505,18 +547,18 @@ namespace xsimd template ::value>> XSIMD_INLINE batch rotr(batch const& self, batch const& other, requires_arch) noexcept { - XSIMD_IF_CONSTEXPR(std::is_unsigned::value) + XSIMD_IF_CONSTEXPR(sizeof(T) == 4) { - XSIMD_IF_CONSTEXPR(sizeof(T) == 4) - { - return _mm256_rorv_epi32(self, other); - } - else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) - { - return _mm256_rorv_epi64(self, other); - } + return _mm256_rorv_epi32(self, other); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) + { + return _mm256_rorv_epi64(self, other); + } + else + { + return rotr(self, other, avx2 {}); } - return rotr(self, other, avx2 {}); } template ::value>> XSIMD_INLINE batch rotr(batch const& self, int32_t other, requires_arch) noexcept @@ -529,18 +571,83 @@ namespace xsimd { constexpr auto bits = std::numeric_limits::digits + std::numeric_limits::is_signed; static_assert(count < bits, "Count must be less than the number of bits in T"); - XSIMD_IF_CONSTEXPR(std::is_unsigned::value) + XSIMD_IF_CONSTEXPR(sizeof(T) == 4) { - XSIMD_IF_CONSTEXPR(sizeof(T) == 4) - { - return _mm256_ror_epi32(self, count); - } - else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) - { - return _mm256_ror_epi64(self, count); - } + return _mm256_ror_epi32(self, count); } - return rotr(self, avx2 {}); + else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) + { + return _mm256_ror_epi64(self, count); + } + else + { + return rotr(self, avx2 {}); + } + } + + // compress — native EVEX VPCOMPRESS{PS,PD,Q,D} for the widths with VL + // forms. 8/16-bit need AVX512_VBMI2, so they fall through to common{}. + template + XSIMD_INLINE batch compress(batch const& self, batch_bool const& mask, requires_arch) noexcept + { + return _mm256_maskz_compress_ps(mask.mask(), self); + } + template + XSIMD_INLINE batch compress(batch const& self, batch_bool const& mask, requires_arch) noexcept + { + return _mm256_maskz_compress_pd(mask.mask(), self); + } + template + XSIMD_INLINE batch compress(batch const& self, batch_bool const& mask, requires_arch) noexcept + { + return _mm256_maskz_compress_epi32(mask.mask(), self); + } + template + XSIMD_INLINE batch compress(batch const& self, batch_bool const& mask, requires_arch) noexcept + { + return _mm256_maskz_compress_epi32(mask.mask(), self); + } + template + XSIMD_INLINE batch compress(batch const& self, batch_bool const& mask, requires_arch) noexcept + { + return _mm256_maskz_compress_epi64(mask.mask(), self); + } + template + XSIMD_INLINE batch compress(batch const& self, batch_bool const& mask, requires_arch) noexcept + { + return _mm256_maskz_compress_epi64(mask.mask(), self); + } + + // expand + template + XSIMD_INLINE batch expand(batch const& self, batch_bool const& mask, requires_arch) noexcept + { + return _mm256_maskz_expand_ps(mask.mask(), self); + } + template + XSIMD_INLINE batch expand(batch const& self, batch_bool const& mask, requires_arch) noexcept + { + return _mm256_maskz_expand_pd(mask.mask(), self); + } + template + XSIMD_INLINE batch expand(batch const& self, batch_bool const& mask, requires_arch) noexcept + { + return _mm256_maskz_expand_epi32(mask.mask(), self); + } + template + XSIMD_INLINE batch expand(batch const& self, batch_bool const& mask, requires_arch) noexcept + { + return _mm256_maskz_expand_epi32(mask.mask(), self); + } + template + XSIMD_INLINE batch expand(batch const& self, batch_bool const& mask, requires_arch) noexcept + { + return _mm256_maskz_expand_epi64(mask.mask(), self); + } + template + XSIMD_INLINE batch expand(batch const& self, batch_bool const& mask, requires_arch) noexcept + { + return _mm256_maskz_expand_epi64(mask.mask(), self); } // all