#pragma once

#if defined(__ARM_NEON__) || defined(__ARM_NEON)

// The byte swaps in load()/store() below are unconditional, so this header
// is only valid when the host stores 32-bit words in little-endian order.
#if !defined(__BYTE_ORDER__) || !defined(__ORDER_LITTLE_ENDIAN__) || (__BYTE_ORDER__) != (__ORDER_LITTLE_ENDIAN__)
#error "Currently, USE_NEON is only for little-endian."
#endif

#include <utility>      // std::pair, std::make_pair
#include <arm_neon.h>   // uint32x4_t and the v*q_* intrinsics

#include "portable.h"
#include "multi2_block.h"

namespace multi2 {

namespace arm {

// Thin value wrapper around a NEON uint32x4_t (four 32-bit lanes) that
// provides the lane-wise operators used by the generic code: + - ^ |.
class neon {
private:
	uint32x4_t v;

public:
	// Leaves the lanes uninitialised; callers are expected to assign before use.
	inline neon() { }
	// Broadcasts n into all four lanes.
	inline neon(uint32_t n) { v = vdupq_n_u32(n); }
	// Wraps an existing vector. Deliberately implicit: the operators and
	// free functions below return raw uint32x4_t values and rely on it.
	inline neon(const uint32x4_t &r) { v = r; }

	inline neon &operator=(const neon &other) {
		v = other.v;
		return *this;
	}

	// Lane-wise arithmetic/logic; additions and subtractions wrap modulo 2^32.
	inline neon operator+(const neon &other) const { return vaddq_u32(v, other.v); }
	inline neon operator-(const neon &other) const { return vsubq_u32(v, other.v); }
	inline neon operator^(const neon &other) const { return veorq_u32(v, other.v); }
	inline neon operator|(const neon &other) const { return vorrq_u32(v, other.v); }

	// Read-only access to the underlying vector.
	inline const uint32x4_t &value() const { return v; }
};

}

// Reads 32 bytes (eight 32-bit words) from p. vld2q_u32 de-interleaves them:
// the even-indexed words go to `left`, the odd-indexed words to `right`.
// Each word is then byte-reversed, i.e. interpreted as big-endian on this
// little-endian host.
// NOTE: p is accessed through a uint32_t pointer; callers must supply a
// buffer of at least 32 readable bytes.
template<>
inline void block<arm::neon>::load(const uint8_t *p) {
	const uint32_t *q = reinterpret_cast<const uint32_t *>(p);

	uint32x4x2_t a = vld2q_u32(q);
	left = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(a.val[0])));
	right = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(a.val[1])));
}

// Inverse of load(): byte-reverses every word of `left` and `right`, then
// writes them interleaved (left0, right0, left1, right1, ...) as 32 bytes at p.
template<>
inline void block<arm::neon>::store(uint8_t *p) const {
	uint32x4_t a0 = left.value();
	uint32x4_t a1 = right.value();
	uint32x4_t b0 = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(a0)));
	uint32x4_t b1 = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(a1)));

	uint32_t *q = reinterpret_cast<uint32_t *>(p);
	uint32x4x2_t d = { b0, b1 };
	vst2q_u32(q, d);
}

// CBC chaining step applied to *this (used as the block-decrypted data).
//
// For each half (left/right) the vector of `c` is shifted up by one lane and
// `state` is inserted into lane 0, so that lane i of *this is XORed with
// lane i-1 of `c`, and lane 0 with `state`.
//
// Returns the XORed block together with the next chaining state, which is
// lane 3 of `c`.
//
// The lane-order comments list lanes from highest (3) to lowest (0).
template<>
inline std::pair<block<arm::neon>, cbc_state> block<arm::neon>::cbc_post_decrypt(const block &c, const cbc_state &state) const {
	uint32x4_t c0 = c.left.value(); // 3 2 1 0
	uint32x4_t c1 = c.right.value();

	// Lane 3 of c becomes the chaining state for the next call.
	uint32_t s0 = vgetq_lane_u32(c0, 3); // 3
	uint32_t s1 = vgetq_lane_u32(c1, 3);

	// Rotate lanes up by one: lane 3 wraps round into lane 0 ...
	uint32x4_t b0 = vextq_u32(c0, c0, 3); // 2 1 0 3
	uint32x4_t b1 = vextq_u32(c1, c1, 3);
	// ... and is then overwritten with the incoming state.
	uint32x4_t x0 = vsetq_lane_u32(state.left, b0, 0); // 2 1 0 s
	uint32x4_t x1 = vsetq_lane_u32(state.right, b1, 0);

	uint32x4_t d0 = left.value(); // 3 2 1 0
	uint32x4_t d1 = right.value();

	uint32x4_t p0 = veorq_u32(d0, x0);
	uint32x4_t p1 = veorq_u32(d1, x1);

	return std::make_pair(block(p0, p1), cbc_state(s0, s1));
}

// Rotates every 32-bit lane left by N bits.
// N is used as an immediate for the shift intrinsics, so it must be a
// compile-time constant in the range 1..31.
// NOTE(review): the template parameter is assumed to be `int N` - confirm it
// matches the generic rot declaration in multi2_block.h.
template<int N>
inline arm::neon rot(const arm::neon &v) {
	uint32x4_t a = v.value();
	if (N == 16) {
		// A 16-bit rotate is just a swap of the two halfwords of each lane.
		return vreinterpretq_u32_u16(vrev32q_u16(vreinterpretq_u16_u32(a)));
	} else {
		// (a >> (32 - N)) supplies the low N bits; vsli inserts (a << N)
		// above them, leaving those low N bits untouched.
		return vsliq_n_u32(vshrq_n_u32(a, 32 - N), a, N);
	}
}

// Computes a + (a >> 31) per lane in a single shift-right-accumulate.
// Modulo 2^32 this equals rotl(a, 1) - a, since rotl(a, 1) = 2a + (a >> 31).
template<>
inline arm::neon rot1_sub(const arm::neon &v) {
	uint32x4_t a = v.value();
	return vsraq_n_u32(a, a, 31);
}

// Computes 3v + d per lane, where d is all-ones (i.e. -1) when the lane's top
// bit is clear and 0 when it is set.
// Modulo 2^32 this equals rotl(v, 1) + v - 1, since
// rotl(v, 1) = 2v + msb and (msb - 1) is exactly d.
template<>
inline arm::neon rot1_add_dec(const arm::neon &v) {
	uint32x4_t d = vcgeq_s32(vreinterpretq_s32_u32(v.value()), vdupq_n_s32(0));
	return v + v + v + arm::neon(d);
}

}

#endif /* __ARM_NEON__ || __ARM_NEON */