#pragma once

// DO NOT DEFINE STATIC DATA IN THIS HEADER!
// See Note [Do not compile initializers with AVX]

#include <ATen/cpu/vec/intrinsics.h>
#include <ATen/cpu/vec/vec_base.h>
#include <c10/util/irange.h>

#if defined(__aarch64__) && defined(AT_BUILD_ARM_VEC256_WITH_SLEEF)
#include <sleef.h>
#endif

// Sleef offers vectorized versions of some transcendentals
// such as sin, cos, tan, etc.
// However, for now we opt for the STL versions, since we are not
// building with Sleef for mobile yet.

namespace at {
namespace vec {
// See Note [CPU_CAPABILITY namespace]
inline namespace CPU_CAPABILITY {

// Right now this contains only the aarch64 implementation.
// aarch32 is not currently supported, for the following two reasons:
// 1. Due to the differences between the aarch32 and aarch64 ISAs, intrinsics
//    that work for aarch64 do not work for aarch32.
// 2. Android NDK r21 has problems compiling aarch32.
//    Clang seg faults.
//    https://github.com/android/ndk/issues/1248
//    https://bugs.llvm.org/show_bug.cgi?id=45824
// Most likely we will do aarch32 support with inline asm.
#if defined(__aarch64__)

#ifdef __BIG_ENDIAN__
#error "Big endian is not supported."
#endif

#if defined(AT_BUILD_ARM_VEC256_WITH_SLEEF)
#define USE_SLEEF(sleef_code, non_sleef_code) sleef_code
#else
#define USE_SLEEF(sleef_code, non_sleef_code) non_sleef_code
#endif

template <int64_t index, bool mask_val>
struct BlendRegs {
  static float32x4_t impl(
      const float32x4_t& a,
      const float32x4_t& b,
      float32x4_t& res);
};

template <int64_t index>
struct BlendRegs<index, true> {
  static float32x4_t impl(
      const float32x4_t& a,
      const float32x4_t& b,
      float32x4_t& res) {
    return vsetq_lane_f32(vgetq_lane_f32(b, index), res, index);
  }
};

template <int64_t index>
struct BlendRegs<index, false> {
  static float32x4_t impl(
      const float32x4_t& a,
      const float32x4_t& b,
      float32x4_t& res) {
    return vsetq_lane_f32(vgetq_lane_f32(a, index), res, index);
  }
};

template <>
class Vectorized<float> {
 private:
  float32x4x2_t values;

 public:
  using value_type = float;
  using size_type = int;
  static constexpr size_type size() {
    return 8;
  }
  Vectorized() {}
  Vectorized(float32x4x2_t v) : values(v) {}
  Vectorized(float val) : values{vdupq_n_f32(val), vdupq_n_f32(val)} {}
  Vectorized(
      float val0, float val1, float val2, float val3,
      float val4, float val5, float val6, float val7)
      : values{val0, val1, val2, val3, val4, val5, val6, val7} {}
  Vectorized(float32x4_t val0, float32x4_t val1) : values{val0, val1} {}
  operator float32x4x2_t() const {
    return values;
  }
  template <int64_t mask>
  static Vectorized<float> blend(
      const Vectorized<float>& a,
      const Vectorized<float>& b) {
    Vectorized<float> vec;
    // 0.
    vec.values.val[0] = BlendRegs<0, (mask & 0x01) != 0>::impl(
        a.values.val[0], b.values.val[0], vec.values.val[0]);
    vec.values.val[0] = BlendRegs<1, (mask & 0x02) != 0>::impl(
        a.values.val[0], b.values.val[0], vec.values.val[0]);
    vec.values.val[0] = BlendRegs<2, (mask & 0x04) != 0>::impl(
        a.values.val[0], b.values.val[0], vec.values.val[0]);
    vec.values.val[0] = BlendRegs<3, (mask & 0x08) != 0>::impl(
        a.values.val[0], b.values.val[0], vec.values.val[0]);
    // 1.
    vec.values.val[1] = BlendRegs<0, (mask & 0x10) != 0>::impl(
        a.values.val[1], b.values.val[1], vec.values.val[1]);
    vec.values.val[1] = BlendRegs<1, (mask & 0x20) != 0>::impl(
        a.values.val[1], b.values.val[1], vec.values.val[1]);
    vec.values.val[1] = BlendRegs<2, (mask & 0x40) != 0>::impl(
        a.values.val[1], b.values.val[1], vec.values.val[1]);
    vec.values.val[1] = BlendRegs<3, (mask & 0x80) != 0>::impl(
        a.values.val[1], b.values.val[1], vec.values.val[1]);
    return vec;
  }
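  // Editorial illustration (not from the original header): in blend(), bit i
  // of the compile-time mask selects output lane i from `b` when set and from
  // `a` when clear. For example, assuming
  //   Vectorized<float> a(0.f);               // {0, 0, 0, 0, 0, 0, 0, 0}
  //   Vectorized<float> b(1.f);               // {1, 1, 1, 1, 1, 1, 1, 1}
  // then
  //   Vectorized<float>::blend<0x0F>(a, b);   // {1, 1, 1, 1, 0, 0, 0, 0}
  //   Vectorized<float>::blend<0xAA>(a, b);   // {0, 1, 0, 1, 0, 1, 0, 1}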
  static Vectorized<float> blendv(
      const Vectorized<float>& a,
      const Vectorized<float>& b,
      const Vectorized<float>& mask) {
    // TODO
    // NB: This requires that each value, i.e., each uint value,
    // of the mask either all be zeros or all be 1s.
    // We perhaps need some kind of an assert?
    // But that will affect performance.
    Vectorized<float> vec(mask.values);
    vec.values.val[0] = vbslq_f32(
        vreinterpretq_u32_f32(vec.values.val[0]),
        b.values.val[0],
        a.values.val[0]);
    vec.values.val[1] = vbslq_f32(
        vreinterpretq_u32_f32(vec.values.val[1]),
        b.values.val[1],
        a.values.val[1]);
    return vec;
  }
  template <typename step_t>
  static Vectorized<float> arange(
      float base = 0.f,
      step_t step = static_cast<step_t>(1)) {
    const Vectorized<float> base_vec(base);
    const Vectorized<float> step_vec(step);
    const Vectorized<float> step_sizes(0, 1, 2, 3, 4, 5, 6, 7);
    return fmadd(step_sizes, step_vec, base_vec);
  }
  static Vectorized<float> set(
      const Vectorized<float>& a,
      const Vectorized<float>& b,
      int64_t count = size()) {
    switch (count) {
      case 0:
        return a;
      case 1: {
        Vectorized<float> vec;
        static uint32x4_t mask_low = {0xFFFFFFFF, 0x0, 0x0, 0x0};
        vec.values.val[0] = vreinterpretq_f32_u32(mask_low);
        vec.values.val[1] = a.values.val[1];
        vec.values.val[0] = vbslq_f32(
            vreinterpretq_u32_f32(vec.values.val[0]),
            b.values.val[0],
            a.values.val[0]);
        return vec;
      }
      case 2: {
        Vectorized<float> vec;
        static uint32x4_t mask_low = {0xFFFFFFFF, 0xFFFFFFFF, 0x0, 0x0};
        vec.values.val[0] = vreinterpretq_f32_u32(mask_low);
        vec.values.val[1] = a.values.val[1];
        vec.values.val[0] = vbslq_f32(
            vreinterpretq_u32_f32(vec.values.val[0]),
            b.values.val[0],
            a.values.val[0]);
        return vec;
      }
      case 3: {
        Vectorized<float> vec;
        static uint32x4_t mask_low = {0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x0};
        vec.values.val[0] = vreinterpretq_f32_u32(mask_low);
        vec.values.val[1] = a.values.val[1];
        vec.values.val[0] = vbslq_f32(
            vreinterpretq_u32_f32(vec.values.val[0]),
            b.values.val[0],
            a.values.val[0]);
        return vec;
      }
      case 4:
        return Vectorized<float>(b.values.val[0], a.values.val[1]);
      case 5: {
        Vectorized<float> vec;
        static uint32x4_t mask_high = {0xFFFFFFFF, 0x0, 0x0, 0x0};
        vec.values.val[0] = b.values.val[0];
        vec.values.val[1] = vreinterpretq_f32_u32(mask_high);
        vec.values.val[1] = vbslq_f32(
            vreinterpretq_u32_f32(vec.values.val[1]),
            b.values.val[1],
            a.values.val[1]);
        return vec;
      }
      case 6: {
        Vectorized<float> vec;
        static uint32x4_t mask_high = {0xFFFFFFFF, 0xFFFFFFFF, 0x0, 0x0};
        vec.values.val[0] = b.values.val[0];
        vec.values.val[1] = vreinterpretq_f32_u32(mask_high);
        vec.values.val[1] = vbslq_f32(
            vreinterpretq_u32_f32(vec.values.val[1]),
            b.values.val[1],
            a.values.val[1]);
        return vec;
      }
      case 7: {
        Vectorized<float> vec;
        static uint32x4_t mask_high = {0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x0};
        vec.values.val[0] = b.values.val[0];
        vec.values.val[1] = vreinterpretq_f32_u32(mask_high);
        vec.values.val[1] = vbslq_f32(
            vreinterpretq_u32_f32(vec.values.val[1]),
            b.values.val[1],
            a.values.val[1]);
        return vec;
      }
    }
    return b;
  }
  static Vectorized<float> loadu(const void* ptr, int64_t count = size()) {
    if (count == size()) {
      return vld1q_f32_x2(reinterpret_cast<const float*>(ptr));
    } else if (count == (size() >> 1)) {
      Vectorized<float> res;
      res.values.val[0] = vld1q_f32(reinterpret_cast<const float*>(ptr));
      res.values.val[1] = vdupq_n_f32(0.f);
      return res;
    } else {
      __at_align__ float tmp_values[size()];
      for (const auto i : c10::irange(size())) {
        tmp_values[i] = 0.0;
      }
      std::memcpy(
          tmp_values,
          reinterpret_cast<const float*>(ptr),
          count * sizeof(float));
      return vld1q_f32_x2(reinterpret_cast<const float*>(tmp_values));
    }
  }
  void store(void* ptr, int64_t count = size()) const {
    if (count == size()) {
      vst1q_f32_x2(reinterpret_cast<float*>(ptr), values);
    } else if (count == (size() >> 1)) {
      vst1q_f32(reinterpret_cast<float*>(ptr), values.val[0]);
    } else {
      float tmp_values[size()];
      vst1q_f32_x2(reinterpret_cast<float*>(tmp_values), values);
      std::memcpy(ptr, tmp_values, count * sizeof(float));
    }
  }
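  // Usage sketch (editorial): loading or storing a partial vector with a
  // `count` argument touches only `count` floats; loadu() zero-fills the
  // remaining lanes. Assuming caller-side buffers:
  //   float src[3] = {1.f, 2.f, 3.f};
  //   auto v = Vectorized<float>::loadu(src, 3);  // {1, 2, 3, 0, 0, 0, 0, 0}
  //   float dst[3];
  //   v.store(dst, 3);                            // writes exactly 3 floats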
  inline const float32x4_t& get_low() const {
    return values.val[0];
  }
  inline float32x4_t& get_low() {
    return values.val[0];
  }
  inline const float32x4_t& get_high() const {
    return values.val[1];
  }
  inline float32x4_t& get_high() {
    return values.val[1];
  }
  // Very slow implementation of indexing.
  // Only required because vec256_qint refers to this.
  // Once we specialize that implementation for ARM
  // this should be removed. TODO (kimishpatel)
  float operator[](int idx) const {
    __at_align__ float tmp[size()];
    store(tmp);
    return tmp[idx];
  }
  float operator[](int idx) {
    __at_align__ float tmp[size()];
    store(tmp);
    return tmp[idx];
  }
  // For boolean use cases, where we want to check whether any lane is 1, all
  // lanes are zero, etc., this can be done faster in a different way.
  int zero_mask() const {
    __at_align__ float tmp[size()];
    store(tmp);
    int mask = 0;
    for (int i = 0; i < size(); ++i) {
      if (tmp[i] == 0.f) {
        mask |= (1 << i);
      }
    }
    return mask;
  }
  Vectorized<float> isnan() const {
    __at_align__ float tmp[size()];
    __at_align__ float res[size()];
    store(tmp);
    for (const auto i : c10::irange(size())) {
      if (_isnan(tmp[i])) {
        std::memset(static_cast<void*>(&res[i]), 0xFF, sizeof(float));
      } else {
        std::memset(static_cast<void*>(&res[i]), 0, sizeof(float));
      }
    }
    return loadu(res);
  }
  Vectorized<float> map(float (*const f)(float)) const {
    __at_align__ float tmp[size()];
    store(tmp);
    for (const auto i : c10::irange(size())) {
      tmp[i] = f(tmp[i]);
    }
    return loadu(tmp);
  }
  Vectorized<float> abs() const {
    return Vectorized<float>(vabsq_f32(values.val[0]), vabsq_f32(values.val[1]));
  }
  Vectorized<float> angle() const {
    auto zero = Vectorized<float>(0);
    auto pi = Vectorized<float>(c10::pi<float>);
    auto tmp = blendv(zero, pi, *this < zero);
    return blendv(tmp, *this, isnan());
  }
  Vectorized<float> real() const {
    return *this;
  }
  Vectorized<float> imag() const {
    return Vectorized<float>(0.f);
  }
  Vectorized<float> conj() const {
    return *this;
  }
  Vectorized<float> acos() const {
    return USE_SLEEF(
        Vectorized<float>(Sleef_acosf4_u10(values.val[0]), Sleef_acosf4_u10(values.val[1])),
        map(std::acos));
  }
  Vectorized<float> asin() const {
    return USE_SLEEF(
        Vectorized<float>(Sleef_asinf4_u10(values.val[0]), Sleef_asinf4_u10(values.val[1])),
        map(std::asin));
  }
  Vectorized<float> atan() const {
    return USE_SLEEF(
        Vectorized<float>(Sleef_atanf4_u10(values.val[0]), Sleef_atanf4_u10(values.val[1])),
        map(std::atan));
  }
  Vectorized<float> atan2(const Vectorized<float>& exp) const {
    USE_SLEEF(
        {
          return Vectorized<float>(
              Sleef_atan2f4_u10(values.val[0], exp.values.val[0]),
              Sleef_atan2f4_u10(values.val[1], exp.values.val[1]));
        },
        {
          __at_align__ float tmp[size()];
          __at_align__ float tmp_exp[size()];
          store(tmp);
          exp.store(tmp_exp);
          for (const auto i : c10::irange(size())) {
            tmp[i] = std::atan2(tmp[i], tmp_exp[i]);
          }
          return loadu(tmp);
        })
  }
  Vectorized<float> copysign(const Vectorized<float>& sign) const {
    USE_SLEEF(
        {
          return Vectorized<float>(
              Sleef_copysignf4(values.val[0], sign.values.val[0]),
              Sleef_copysignf4(values.val[1], sign.values.val[1]));
        },
        {
          __at_align__ float tmp[size()];
          __at_align__ float tmp_sign[size()];
          store(tmp);
          sign.store(tmp_sign);
          for (size_type i = 0; i < size(); i++) {
            tmp[i] = std::copysign(tmp[i], tmp_sign[i]);
          }
          return loadu(tmp);
        })
  }
  Vectorized<float> erf() const {
    return USE_SLEEF(
        Vectorized<float>(Sleef_erff4_u10(values.val[0]), Sleef_erff4_u10(values.val[1])),
        map(std::erf));
  }
  Vectorized<float> erfc() const {
    return USE_SLEEF(
        Vectorized<float>(Sleef_erfcf4_u15(values.val[0]), Sleef_erfcf4_u15(values.val[1])),
        map(std::erfc));
  }
  Vectorized<float> erfinv() const {
    return map(calc_erfinv);
  }
  Vectorized<float> exp() const {
    return USE_SLEEF(
        Vectorized<float>(Sleef_expf4_u10(values.val[0]), Sleef_expf4_u10(values.val[1])),
        map(std::exp));
  }
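  // Editorial note: USE_SLEEF(a, b) keeps exactly one of its two arguments, so
  // a method such as exp() above compiles to the Sleef vector kernels when
  // AT_BUILD_ARM_VEC256_WITH_SLEEF is defined, and to the scalar fallback
  // otherwise; roughly:
  //   return Vectorized<float>(Sleef_expf4_u10(values.val[0]),
  //                            Sleef_expf4_u10(values.val[1]));  // Sleef build
  //   return map(std::exp);                                      // STL fallback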
  Vectorized<float> expm1() const {
    return USE_SLEEF(
        Vectorized<float>(Sleef_expm1f4_u10(values.val[0]), Sleef_expm1f4_u10(values.val[1])),
        map(std::expm1));
  }
  Vectorized<float> fmod(const Vectorized<float>& q) const {
    USE_SLEEF(
        {
          return Vectorized<float>(
              Sleef_fmodf4(values.val[0], q.values.val[0]),
              Sleef_fmodf4(values.val[1], q.values.val[1]));
        },
        {
          __at_align__ float tmp[size()];
          __at_align__ float tmp_q[size()];
          store(tmp);
          q.store(tmp_q);
          for (const auto i : c10::irange(size())) {
            tmp[i] = std::fmod(tmp[i], tmp_q[i]);
          }
          return loadu(tmp);
        })
  }
  Vectorized<float> hypot(const Vectorized<float>& b) const {
    USE_SLEEF(
        {
          return Vectorized<float>(
              Sleef_hypotf4_u05(values.val[0], b.values.val[0]),
              Sleef_hypotf4_u05(values.val[1], b.values.val[1]));
        },
        {
          __at_align__ float tmp[size()];
          __at_align__ float tmp_b[size()];
          store(tmp);
          b.store(tmp_b);
          for (const auto i : c10::irange(size())) {
            tmp[i] = std::hypot(tmp[i], tmp_b[i]);
          }
          return loadu(tmp);
        })
  }
  Vectorized<float> i0() const {
    return map(calc_i0);
  }
  Vectorized<float> i0e() const {
    return map(calc_i0e);
  }
  Vectorized<float> igamma(const Vectorized<float>& x) const {
    __at_align__ float tmp[size()];
    __at_align__ float tmp_x[size()];
    store(tmp);
    x.store(tmp_x);
    for (const auto i : c10::irange(size())) {
      tmp[i] = calc_igamma(tmp[i], tmp_x[i]);
    }
    return loadu(tmp);
  }
  Vectorized<float> igammac(const Vectorized<float>& x) const {
    __at_align__ float tmp[size()];
    __at_align__ float tmp_x[size()];
    store(tmp);
    x.store(tmp_x);
    for (const auto i : c10::irange(size())) {
      tmp[i] = calc_igammac(tmp[i], tmp_x[i]);
    }
    return loadu(tmp);
  }
  Vectorized<float> log() const {
    return USE_SLEEF(
        Vectorized<float>(Sleef_logf4_u10(values.val[0]), Sleef_logf4_u10(values.val[1])),
        map(std::log));
  }
  Vectorized<float> log10() const {
    return USE_SLEEF(
        Vectorized<float>(Sleef_log10f4_u10(values.val[0]), Sleef_log10f4_u10(values.val[1])),
        map(std::log10));
  }
  Vectorized<float> log1p() const {
    return USE_SLEEF(
        Vectorized<float>(Sleef_log1pf4_u10(values.val[0]), Sleef_log1pf4_u10(values.val[1])),
        map(std::log1p));
  }
  Vectorized<float> log2() const {
    return USE_SLEEF(
        Vectorized<float>(Sleef_log2f4_u10(values.val[0]), Sleef_log2f4_u10(values.val[1])),
        map(std::log2));
  }
  Vectorized<float> nextafter(const Vectorized<float>& b) const {
    USE_SLEEF(
        {
          return Vectorized<float>(
              Sleef_nextafterf4(values.val[0], b.values.val[0]),
              Sleef_nextafterf4(values.val[1], b.values.val[1]));
        },
        {
          __at_align__ float tmp[size()];
          __at_align__ float tmp_b[size()];
          store(tmp);
          b.store(tmp_b);
          for (const auto i : c10::irange(size())) {
            tmp[i] = std::nextafter(tmp[i], tmp_b[i]);
          }
          return loadu(tmp);
        })
  }
  Vectorized<float> frac() const;
  Vectorized<float> sin() const {
    return USE_SLEEF(
        Vectorized<float>(Sleef_sinf4_u10(values.val[0]), Sleef_sinf4_u10(values.val[1])),
        map(std::sin));
  }
  Vectorized<float> sinh() const {
    return USE_SLEEF(
        Vectorized<float>(Sleef_sinhf4_u10(values.val[0]), Sleef_sinhf4_u10(values.val[1])),
        map(std::sinh));
  }
  Vectorized<float> cos() const {
    return USE_SLEEF(
        Vectorized<float>(Sleef_cosf4_u10(values.val[0]), Sleef_cosf4_u10(values.val[1])),
        map(std::cos));
  }
  Vectorized<float> cosh() const {
    return USE_SLEEF(
        Vectorized<float>(Sleef_coshf4_u10(values.val[0]), Sleef_coshf4_u10(values.val[1])),
        map(std::cosh));
  }
  Vectorized<float> ceil() const {
    return map(at::native::ceil_impl);
  }
  Vectorized<float> floor() const {
    return map(at::native::floor_impl);
  }
  Vectorized<float> neg() const {
    return Vectorized<float>(vnegq_f32(values.val[0]), vnegq_f32(values.val[1]));
  }
  Vectorized<float> round() const {
    // We do not use std::round because we would like to round midway numbers
    // to the nearest even integer.
    return map(at::native::round_impl);
  }
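  // Illustrative example (editorial) of the round-half-to-even behavior noted
  // above, assuming at::native::round_impl follows rint/nearbyint semantics in
  // the default rounding mode:
  //   Vectorized<float>(0.5f).round()[0]  // 0.0f
  //   Vectorized<float>(1.5f).round()[0]  // 2.0f
  //   Vectorized<float>(2.5f).round()[0]  // 2.0f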
  Vectorized<float> tan() const {
    return USE_SLEEF(
        Vectorized<float>(Sleef_tanf4_u10(values.val[0]), Sleef_tanf4_u10(values.val[1])),
        map(std::tan));
  }
  Vectorized<float> tanh() const {
    return USE_SLEEF(
        Vectorized<float>(Sleef_tanhf4_u10(values.val[0]), Sleef_tanhf4_u10(values.val[1])),
        map(std::tanh));
  }
  Vectorized<float> trunc() const {
    float32x4_t r0 = vrndq_f32(values.val[0]);
    float32x4_t r1 = vrndq_f32(values.val[1]);
    return Vectorized<float>(r0, r1);
  }
  Vectorized<float> lgamma() const {
    return USE_SLEEF(
        Vectorized<float>(Sleef_lgammaf4_u10(values.val[0]), Sleef_lgammaf4_u10(values.val[1])),
        map(std::lgamma));
  }
  Vectorized<float> sqrt() const {
    return Vectorized<float>(vsqrtq_f32(values.val[0]), vsqrtq_f32(values.val[1]));
  }
  Vectorized<float> reciprocal() const {
    auto r0 = vdivq_f32(vdupq_n_f32(1.0f), values.val[0]);
    auto r1 = vdivq_f32(vdupq_n_f32(1.0f), values.val[1]);
    return Vectorized<float>(r0, r1);
  }
  Vectorized<float> rsqrt() const {
    return this->sqrt().reciprocal();
  }
  Vectorized<float> pow(const Vectorized<float>& exp) const {
    USE_SLEEF(
        {
          return Vectorized<float>(
              Sleef_powf4_u10(values.val[0], exp.values.val[0]),
              Sleef_powf4_u10(values.val[1], exp.values.val[1]));
        },
        {
          __at_align__ float tmp[size()];
          __at_align__ float tmp_exp[size()];
          store(tmp);
          exp.store(tmp_exp);
          for (const auto i : c10::irange(size())) {
            tmp[i] = std::pow(tmp[i], tmp_exp[i]);
          }
          return loadu(tmp);
        })
  }
  Vectorized<float> operator==(const Vectorized<float>& other) const {
    float32x4_t r0 =
        vreinterpretq_f32_u32(vceqq_f32(values.val[0], other.values.val[0]));
    float32x4_t r1 =
        vreinterpretq_f32_u32(vceqq_f32(values.val[1], other.values.val[1]));
    return Vectorized<float>(r0, r1);
  }
  Vectorized<float> operator!=(const Vectorized<float>& other) const {
    float32x4_t r0 = vreinterpretq_f32_u32(
        vmvnq_u32(vceqq_f32(values.val[0], other.values.val[0])));
    float32x4_t r1 = vreinterpretq_f32_u32(
        vmvnq_u32(vceqq_f32(values.val[1], other.values.val[1])));
    return Vectorized<float>(r0, r1);
  }
  Vectorized<float> operator<(const Vectorized<float>& other) const {
    float32x4_t r0 =
        vreinterpretq_f32_u32(vcltq_f32(values.val[0], other.values.val[0]));
    float32x4_t r1 =
        vreinterpretq_f32_u32(vcltq_f32(values.val[1], other.values.val[1]));
    return Vectorized<float>(r0, r1);
  }
  Vectorized<float> operator<=(const Vectorized<float>& other) const {
    float32x4_t r0 =
        vreinterpretq_f32_u32(vcleq_f32(values.val[0], other.values.val[0]));
    float32x4_t r1 =
        vreinterpretq_f32_u32(vcleq_f32(values.val[1], other.values.val[1]));
    return Vectorized<float>(r0, r1);
  }
  Vectorized<float> operator>(const Vectorized<float>& other) const {
    float32x4_t r0 =
        vreinterpretq_f32_u32(vcgtq_f32(values.val[0], other.values.val[0]));
    float32x4_t r1 =
        vreinterpretq_f32_u32(vcgtq_f32(values.val[1], other.values.val[1]));
    return Vectorized<float>(r0, r1);
  }
  Vectorized<float> operator>=(const Vectorized<float>& other) const {
    float32x4_t r0 =
        vreinterpretq_f32_u32(vcgeq_f32(values.val[0], other.values.val[0]));
    float32x4_t r1 =
        vreinterpretq_f32_u32(vcgeq_f32(values.val[1], other.values.val[1]));
    return Vectorized<float>(r0, r1);
  }
  Vectorized<float> eq(const Vectorized<float>& other) const;
  Vectorized<float> ne(const Vectorized<float>& other) const;
  Vectorized<float> gt(const Vectorized<float>& other) const;
  Vectorized<float> ge(const Vectorized<float>& other) const;
  Vectorized<float> lt(const Vectorized<float>& other) const;
  Vectorized<float> le(const Vectorized<float>& other) const;
};
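// Editorial usage sketch: the comparison operators above return per-lane bit
// masks (each lane is all bits set or all bits clear), which is exactly the
// form blendv() expects, e.g. a branch-free per-lane lower bound:
//   Vectorized<float> x(0.5f), lo(1.f);
//   auto bounded = Vectorized<float>::blendv(x, lo, x < lo);  // max(x, lo) per lane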
template <>
Vectorized<float> inline operator+(const Vectorized<float>& a, const Vectorized<float>& b) {
  float32x4_t r0 = vaddq_f32(a.get_low(), b.get_low());
  float32x4_t r1 = vaddq_f32(a.get_high(), b.get_high());
  return Vectorized<float>(r0, r1);
}

template <>
Vectorized<float> inline operator-(const Vectorized<float>& a, const Vectorized<float>& b) {
  float32x4_t r0 = vsubq_f32(a.get_low(), b.get_low());
  float32x4_t r1 = vsubq_f32(a.get_high(), b.get_high());
  return Vectorized<float>(r0, r1);
}

template <>
Vectorized<float> inline operator*(const Vectorized<float>& a, const Vectorized<float>& b) {
  float32x4_t r0 = vmulq_f32(a.get_low(), b.get_low());
  float32x4_t r1 = vmulq_f32(a.get_high(), b.get_high());
  return Vectorized<float>(r0, r1);
}

template <>
Vectorized<float> inline operator/(const Vectorized<float>& a, const Vectorized<float>& b) {
  float32x4_t r0 = vdivq_f32(a.get_low(), b.get_low());
  float32x4_t r1 = vdivq_f32(a.get_high(), b.get_high());
  return Vectorized<float>(r0, r1);
}

// frac. Implement this here so we can use subtraction
inline Vectorized<float> Vectorized<float>::frac() const {
  return *this - this->trunc();
}

// Implements the IEEE 754 201X `maximum` operation, which propagates NaN if
// either input is a NaN.
template <>
Vectorized<float> inline maximum(const Vectorized<float>& a, const Vectorized<float>& b) {
  float32x4_t r0 = vmaxq_f32(a.get_low(), b.get_low());
  float32x4_t r1 = vmaxq_f32(a.get_high(), b.get_high());
  return Vectorized<float>(r0, r1);
}

// Implements the IEEE 754 201X `minimum` operation, which propagates NaN if
// either input is a NaN.
template <>
Vectorized<float> inline minimum(const Vectorized<float>& a, const Vectorized<float>& b) {
  float32x4_t r0 = vminq_f32(a.get_low(), b.get_low());
  float32x4_t r1 = vminq_f32(a.get_high(), b.get_high());
  return Vectorized<float>(r0, r1);
}

template <>
Vectorized<float> inline clamp(
    const Vectorized<float>& a,
    const Vectorized<float>& min,
    const Vectorized<float>& max) {
  return minimum(max, maximum(min, a));
}

template <>
Vectorized<float> inline clamp_max(const Vectorized<float>& a, const Vectorized<float>& max) {
  return minimum(max, a);
}

template <>
Vectorized<float> inline clamp_min(const Vectorized<float>& a, const Vectorized<float>& min) {
  return maximum(min, a);
}

template <>
Vectorized<float> inline operator&(const Vectorized<float>& a, const Vectorized<float>& b) {
  float32x4_t r0 = vreinterpretq_f32_u32(vandq_u32(
      vreinterpretq_u32_f32(a.get_low()), vreinterpretq_u32_f32(b.get_low())));
  float32x4_t r1 = vreinterpretq_f32_u32(vandq_u32(
      vreinterpretq_u32_f32(a.get_high()), vreinterpretq_u32_f32(b.get_high())));
  return Vectorized<float>(r0, r1);
}

template <>
Vectorized<float> inline operator|(const Vectorized<float>& a, const Vectorized<float>& b) {
  float32x4_t r0 = vreinterpretq_f32_u32(vorrq_u32(
      vreinterpretq_u32_f32(a.get_low()), vreinterpretq_u32_f32(b.get_low())));
  float32x4_t r1 = vreinterpretq_f32_u32(vorrq_u32(
      vreinterpretq_u32_f32(a.get_high()), vreinterpretq_u32_f32(b.get_high())));
  return Vectorized<float>(r0, r1);
}

template <>
Vectorized<float> inline operator^(const Vectorized<float>& a, const Vectorized<float>& b) {
  float32x4_t r0 = vreinterpretq_f32_u32(veorq_u32(
      vreinterpretq_u32_f32(a.get_low()), vreinterpretq_u32_f32(b.get_low())));
  float32x4_t r1 = vreinterpretq_f32_u32(veorq_u32(
      vreinterpretq_u32_f32(a.get_high()), vreinterpretq_u32_f32(b.get_high())));
  return Vectorized<float>(r0, r1);
}

inline Vectorized<float> Vectorized<float>::eq(const Vectorized<float>& other) const {
  return (*this == other) & Vectorized<float>(1.0f);
}

inline Vectorized<float> Vectorized<float>::ne(const Vectorized<float>& other) const {
  return (*this != other) & Vectorized<float>(1.0f);
}

inline Vectorized<float> Vectorized<float>::gt(const Vectorized<float>& other) const {
  return (*this > other) & Vectorized<float>(1.0f);
}

inline Vectorized<float> Vectorized<float>::ge(const Vectorized<float>& other) const {
  return (*this >= other) & Vectorized<float>(1.0f);
}

inline Vectorized<float> Vectorized<float>::lt(const Vectorized<float>& other) const {
  return (*this < other) & Vectorized<float>(1.0f);
}

inline Vectorized<float> Vectorized<float>::le(const Vectorized<float>& other) const {
  return (*this <= other) & Vectorized<float>(1.0f);
}
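// Editorial note: operator==/!= and friends return raw lane masks, while
// eq()/ne()/gt()/ge()/lt()/le() above AND that mask with 1.0f, yielding a
// numeric 0.0f / 1.0f per lane, e.g.
//   Vectorized<float> a(1.f), b(1.f);
//   a.eq(b)[0];    // 1.0f
//   (a == b)[0];   // all bits set (a NaN bit pattern when read as float)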
template <>
inline void convert(const float* src, int32_t* dst, int64_t n) {
  int64_t i;
#pragma unroll
  for (i = 0; i <= (n - Vectorized<float>::size()); i += Vectorized<float>::size()) {
    vst1q_s32(dst + i, vcvtq_s32_f32(vld1q_f32(src + i)));
    vst1q_s32(dst + i + 4, vcvtq_s32_f32(vld1q_f32(src + i + 4)));
  }
#pragma unroll
  for (; i < n; i++) {
    dst[i] = static_cast<int32_t>(src[i]);
  }
}

template <>
inline void convert(const int32_t* src, float* dst, int64_t n) {
  int64_t i;
#pragma unroll
  for (i = 0; i <= (n - Vectorized<int32_t>::size()); i += Vectorized<int32_t>::size()) {
    vst1q_f32(dst + i, vcvtq_f32_s32(vld1q_s32(src + i)));
    vst1q_f32(dst + i + 4, vcvtq_f32_s32(vld1q_s32(src + i + 4)));
  }
#pragma unroll
  for (; i < n; i++) {
    dst[i] = static_cast<float>(src[i]);
  }
}

template <>
Vectorized<float> inline fmadd(
    const Vectorized<float>& a,
    const Vectorized<float>& b,
    const Vectorized<float>& c) {
  float32x4_t r0 = vfmaq_f32(c.get_low(), a.get_low(), b.get_low());
  float32x4_t r1 = vfmaq_f32(c.get_high(), a.get_high(), b.get_high());
  return Vectorized<float>(r0, r1);
}

#endif /* defined(__aarch64__) */

} // namespace CPU_CAPABILITY
} // namespace vec
} // namespace at