/* * Copyright (c) Meta Platforms, Inc. and affiliates. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ #pragma once #include #include #include #include #include #include #if FOLLY_X64 && FOLLY_SSE_PREREQ(4, 2) #include #endif #if FOLLY_AARCH64 #include #endif namespace folly { namespace simd::detail { /** * SimdPlatform * * Common interface for some SIMD operations between: sse4.2, avx2, * arm-neon. * * Supported types for T at the moment are uint8_16/uint16_t/uint32_t/uint64_t * * If it's not one of the supported platforms: * std::same_as, void> * There is also a macro: FOLLY_DETAIL_HAS_SIMD_PLATFORM set to 1 or 0 * **/ #if FOLLY_X64 && FOLLY_SSE_PREREQ(4, 2) || FOLLY_AARCH64 template struct SimdPlatformCommon { /** * sclar_t - type of scalar we operate on (uint8_t, uint16_t etc) * reg_t - type of a simd register (__m128i) * logical_t - type of a simd logical register (matches reg_t so far) **/ using scalar_t = typename Platform::scalar_t; using reg_t = typename Platform::reg_t; using logical_t = typename Platform::logical_t; static constexpr int kCardinal = sizeof(reg_t) / sizeof(scalar_t); /** * loads: * precondition: at least one element should be not ignored. * * loada - load from an aligned (to sizeof(reg_t)) address * loadu - load from an unaligned address * unsafeLoadu - load from an unaligned address that disables sanitizers. * This is for reading a register within a page * but maybe outside of the array's boundary. * * Ignored values can be garbage. **/ template static reg_t loada(const scalar_t* ptr, Ignore); static reg_t loadu(const scalar_t* ptr, ignore_none); static reg_t unsafeLoadu(const scalar_t* ptr, ignore_none); /** * Comparing reg_t against the scalar. * * NOTE: less_equal only implemented for uint8_t * for now. **/ static logical_t equal(reg_t reg, scalar_t x); static logical_t less_equal(reg_t reg, scalar_t x); /** * logical reduction **/ template static bool any(logical_t logical, Ignore ignore); template static bool all(logical_t logical, Ignore ignore); /** * logical operations **/ static logical_t logical_or(logical_t x, logical_t y); /** * Converting register to an array for debugging **/ static auto toArray(reg_t x); }; template template FOLLY_ERASE auto SimdPlatformCommon::loada( const scalar_t* ptr, [[maybe_unused]] Ignore ignore) -> reg_t { if constexpr (std::is_same_v) { // There is not point to aligned load instructions // on modern cpus. Arm doesn't even have any. return loadu(ptr, ignore_none{}); } else { // We have a precondition: at least one element is loaded. // From this we can prove that we can unsafely load from // and aligned address. // // Here is an explanation from Stephen Canon: // https://stackoverflow.com/questions/25566302/vectorized-strlen-getting-away-with-reading-unallocated-memory if constexpr (!kIsSanitizeAddress) { return unsafeLoadu(ptr, ignore_none{}); } else { // If the sanitizers are enabled, we want to trigger the issues. // We also want to match the garbage values with/without asan, // so that testing works on the same values as prod. scalar_t buf[kCardinal]; std::memcpy( buf + ignore.first, ptr + ignore.first, (kCardinal - ignore.first - ignore.last) * sizeof(scalar_t)); auto testAgainst = loadu(buf, ignore_none{}); auto res = unsafeLoadu(ptr, ignore_none{}); // Extra sanity check. FOLLY_SAFE_CHECK(all(Platform::equal(res, testAgainst), ignore)); return res; } } } template FOLLY_ERASE auto SimdPlatformCommon::loadu( const scalar_t* ptr, ignore_none) -> reg_t { return Platform::loadu(ptr); } template FOLLY_ERASE auto SimdPlatformCommon::unsafeLoadu( const scalar_t* ptr, ignore_none) -> reg_t { return Platform::unsafeLoadu(ptr); } template FOLLY_ERASE auto SimdPlatformCommon::equal(reg_t reg, scalar_t x) -> logical_t { return Platform::equal(reg, Platform::broadcast(x)); } template FOLLY_ERASE auto SimdPlatformCommon::less_equal(reg_t reg, scalar_t x) -> logical_t { static_assert(std::is_same_v, "not implemented"); return Platform::less_equal(reg, Platform::broadcast(x)); } template template FOLLY_ERASE bool SimdPlatformCommon::any( logical_t logical, Ignore ignore) { if constexpr (std::is_same_v) { return Platform::any(logical); } else { return movemask(logical, ignore).first; } } template template FOLLY_ERASE bool SimdPlatformCommon::all( logical_t logical, Ignore ignore) { if constexpr (std::is_same_v) { return Platform::all(logical); } else { auto [bits, bitsPerElement] = movemask(logical, ignore_none{}); auto expected = n_least_significant_bits( bitsPerElement * (kCardinal - ignore.last)); expected = clear_n_least_significant_bits(expected, ignore.first * bitsPerElement); return (bits & expected) == expected; } } template FOLLY_ERASE auto SimdPlatformCommon::logical_or( logical_t x, logical_t y) -> logical_t { return Platform::logical_or(x, y); } template FOLLY_ERASE auto SimdPlatformCommon::toArray(reg_t x) { std::array res; std::memcpy(&res, &x, sizeof(x)); return res; } #endif #if FOLLY_X64 && FOLLY_SSE_PREREQ(4, 2) template struct SimdSse42PlatformSpecific { using scalar_t = T; using reg_t = __m128i; using logical_t = reg_t; static constexpr std::size_t kCardinal = sizeof(reg_t) / sizeof(scalar_t); FOLLY_ERASE static reg_t loadu(const scalar_t* p) { return _mm_loadu_si128(reinterpret_cast(p)); } FOLLY_DISABLE_SANITIZERS FOLLY_ERASE static reg_t unsafeLoadu(const scalar_t* p) { return _mm_loadu_si128(reinterpret_cast(p)); } FOLLY_ERASE static reg_t broadcast(scalar_t x) { if constexpr (std::is_same_v) { return _mm_set1_epi8(x); } else if constexpr (std::is_same_v) { return _mm_set1_epi16(x); } else if constexpr (std::is_same_v) { return _mm_set1_epi32(x); } else if constexpr (std::is_same_v) { return _mm_set1_epi64x(x); } } FOLLY_ERASE static logical_t equal(reg_t x, reg_t y) { if constexpr (std::is_same_v) { return _mm_cmpeq_epi8(x, y); } else if constexpr (std::is_same_v) { return _mm_cmpeq_epi16(x, y); } else if constexpr (std::is_same_v) { return _mm_cmpeq_epi32(x, y); } else if constexpr (std::is_same_v) { return _mm_cmpeq_epi64(x, y); } } FOLLY_ERASE static logical_t less_equal(reg_t x, reg_t y) { static_assert( std::is_same_v, "other types not implemented"); // No unsigned comparisons on x86 // less equal <=> equal (min) reg_t min = _mm_min_epu8(x, y); return equal(x, min); } FOLLY_ERASE static logical_t logical_or(logical_t x, logical_t y) { return _mm_or_si128(x, y); } FOLLY_ERASE static bool any(logical_t log) { return movemask(log).first; } #if 0 // disabled untill we have a test where this is relevant FOLLY_ERASE static bool all(logical_t log) { auto [bits, bitsPerElement] = movemask(log); return movemask(log) == n_least_significant_bits(kCardinal * bitsPerElement); } #endif }; #define FOLLY_DETAIL_HAS_SIMD_PLATFORM 1 template struct SimdSse42Platform : SimdPlatformCommon> {}; #if defined(__AVX2__) template struct SimdAvx2PlatformSpecific { using scalar_t = T; using reg_t = __m256i; using logical_t = reg_t; static constexpr std::size_t kCardinal = sizeof(reg_t) / sizeof(scalar_t); FOLLY_ERASE static reg_t loadu(const scalar_t* p) { return _mm256_loadu_si256(reinterpret_cast(p)); } FOLLY_DISABLE_SANITIZERS FOLLY_ERASE static reg_t unsafeLoadu(const scalar_t* p) { return _mm256_loadu_si256(reinterpret_cast(p)); } FOLLY_ERASE static reg_t broadcast(scalar_t x) { if constexpr (std::is_same_v) { return _mm256_set1_epi8(x); } else if constexpr (std::is_same_v) { return _mm256_set1_epi16(x); } else if constexpr (std::is_same_v) { return _mm256_set1_epi32(x); } else if constexpr (std::is_same_v) { return _mm256_set1_epi64x(x); } } FOLLY_ERASE static logical_t equal(reg_t x, reg_t y) { if constexpr (std::is_same_v) { return _mm256_cmpeq_epi8(x, y); } else if constexpr (std::is_same_v) { return _mm256_cmpeq_epi16(x, y); } else if constexpr (std::is_same_v) { return _mm256_cmpeq_epi32(x, y); } else if constexpr (std::is_same_v) { return _mm256_cmpeq_epi64(x, y); } } FOLLY_ERASE static logical_t less_equal(reg_t x, reg_t y) { static_assert( std::is_same_v, "other types not implemented"); // See SSE comment reg_t min = _mm256_min_epu8(x, y); return _mm256_cmpeq_epi8(x, min); } FOLLY_ERASE static logical_t logical_or(logical_t x, logical_t y) { return _mm256_or_si256(x, y); } FOLLY_ERASE static bool any(logical_t log) { return simd::movemask(log).first; } #if 0 // disabled untill we have a test where this is relevant FOLLY_ERASE static bool all(logical_t log) { auto [bits, bitsPerElement] = movemask(log); return movemask(log) == n_least_significant_bits(kCardinal * bitsPerElement); } #endif }; template struct SimdAvx2Platform : SimdPlatformCommon> {}; template using SimdPlatform = SimdAvx2Platform; #else template using SimdPlatform = SimdSse42Platform; #endif #elif FOLLY_AARCH64 template struct SimdAarch64PlatformSpecific { using scalar_t = T; FOLLY_ERASE static auto loadu(const scalar_t* p) { if constexpr (std::is_same_v) { return vld1q_u8(p); } else if constexpr (std::is_same_v) { return vld1q_u16(p); } else if constexpr (std::is_same_v) { return vld1q_u32(p); } else if constexpr (std::is_same_v) { return vld1q_u64(p); } } using reg_t = decltype(loadu(nullptr)); using logical_t = reg_t; FOLLY_DISABLE_SANITIZERS FOLLY_ERASE static reg_t unsafeLoadu(const scalar_t* p) { if constexpr (std::is_same_v) { return vld1q_u8(p); } else if constexpr (std::is_same_v) { return vld1q_u16(p); } else if constexpr (std::is_same_v) { return vld1q_u32(p); } else if constexpr (std::is_same_v) { return vld1q_u64(p); } } FOLLY_ERASE static reg_t broadcast(scalar_t x) { if constexpr (std::is_same_v) { return vdupq_n_u8(x); } else if constexpr (std::is_same_v) { return vdupq_n_u16(x); } else if constexpr (std::is_same_v) { return vdupq_n_u32(x); } else if constexpr (std::is_same_v) { return vdupq_n_u64(x); } } FOLLY_ERASE static logical_t equal(reg_t x, reg_t y) { if constexpr (std::is_same_v) { return vceqq_u8(x, y); } else if constexpr (std::is_same_v) { return vceqq_u16(x, y); } else if constexpr (std::is_same_v) { return vceqq_u32(x, y); } else if constexpr (std::is_same_v) { return vceqq_u64(x, y); } } FOLLY_ERASE static logical_t less_equal(reg_t x, reg_t y) { if constexpr (std::is_same_v) { return vcleq_u8(x, y); } else if constexpr (std::is_same_v) { return vcleq_u16(x, y); } else if constexpr (std::is_same_v) { return vcleq_u32(x, y); } else if constexpr (std::is_same_v) { return vcleq_u64(x, y); } } FOLLY_ALWAYS_INLINE static logical_t logical_or(logical_t x, logical_t y) { if constexpr (std::is_same_v) { return vorrq_u8(x, y); } else if constexpr (std::is_same_v) { return vorrq_u16(x, y); } else if constexpr (std::is_same_v) { return vorrq_u32(x, y); } else if constexpr (std::is_same_v) { return vorrq_u64(x, y); } } FOLLY_ALWAYS_INLINE static bool any(logical_t log) { // https://github.com/dotnet/runtime/pull/75864 auto u32 = bit_cast(log); u32 = vpmaxq_u32(u32, u32); auto u64 = bit_cast(u32); return vgetq_lane_u64(u64, 0); } #if 0 // disabled untill we have a test where this is relevant FOLLY_ERASE static bool all(logical_t log) { // Not quite what they did in .Net runtime, but // should be close. // https://github.com/dotnet/runtime/pull/75864 auto u32 = bit_cast(log); u32 = vpminq_u32(u32, u32); auto u64 = bit_cast(u32); return u64 == n_least_significant_bits(64); } #endif }; #define FOLLY_DETAIL_HAS_SIMD_PLATFORM 1 template struct SimdAarch64Platform : SimdPlatformCommon> {}; template using SimdPlatform = SimdAarch64Platform; #define FOLLY_DETAIL_HAS_SIMD_PLATFORM 1 #else #define FOLLY_DETAIL_HAS_SIMD_PLATFORM 0 template using SimdPlatform = void; #endif } // namespace simd::detail } // namespace folly