diff --git a/polyval/benches/polyval.rs b/polyval/benches/polyval.rs index 6d613e4..bea9fc9 100644 --- a/polyval/benches/polyval.rs +++ b/polyval/benches/polyval.rs @@ -4,10 +4,7 @@ extern crate test; -use polyval::{ - Polyval, - universal_hash::{KeyInit, UniversalHash}, -}; +use polyval::{Polyval, universal_hash::UniversalHash}; use test::Bencher; // TODO(tarcieri): move this into the `universal-hash` crate diff --git a/polyval/src/field_element.rs b/polyval/src/field_element.rs index b7ce940..a42913f 100644 --- a/polyval/src/field_element.rs +++ b/polyval/src/field_element.rs @@ -3,31 +3,15 @@ mod soft; use crate::{BLOCK_SIZE, Block}; -use core::fmt; -use core::fmt::Debug; -use core::ops::{Add, Mul, MulAssign}; +use core::{ + fmt::{self, Debug}, + ops::{Add, Mul, MulAssign}, +}; use cpubits::cfg_if; #[cfg(feature = "zeroize")] use zeroize::Zeroize; -cfg_if! { - if #[cfg(all(target_arch = "aarch64", not(polyval_backend = "soft")))] { - mod autodetect; - mod armv8; - pub use autodetect::Polyval as PolyvalGeneric; - } else if #[cfg(all( - any(target_arch = "x86_64", target_arch = "x86"), - not(polyval_backend = "soft") - ))] { - mod autodetect; - mod x86; - pub use autodetect::Polyval as PolyvalGeneric; - } else { - pub use soft::Polyval as PolyvalGeneric; - } -} - /// An element in POLYVAL's field. /// /// This type represents an element of the binary field GF(2^128) modulo the irreducible polynomial @@ -48,11 +32,62 @@ cfg_if! { #[repr(C, align(16))] // Make ABI and alignment compatible with SIMD registers pub(crate) struct FieldElement([u8; BLOCK_SIZE]); +cfg_if! { + if #[cfg(all(target_arch = "aarch64", not(polyval_backend = "soft")))] { + // aarch64 + mod autodetect; + mod armv8; + pub(crate) use autodetect::{InitToken, detect_intrinsics}; + } else if #[cfg(all( + any(target_arch = "x86_64", target_arch = "x86"), + not(polyval_backend = "soft") + ))] { + // x86/x86-64 + mod autodetect; + mod x86; + pub(crate) use autodetect::{InitToken, detect_intrinsics}; + } else { + // Pure Rust fallback implementation for other targets + use universal_hash::array::{Array, ArraySize}; + + pub(crate) type InitToken = (); + pub(crate) fn detect_intrinsics() -> (InitToken, bool) { + ((), false) + } + + impl FieldElement { + /// Default degree of parallelism, i.e. how many powers of `H` to compute. + pub const DEFAULT_PARALLELISM: usize = 8; + + /// Process an individual block. + pub(crate) fn proc_block( + h: FieldElement, + y: FieldElement, + x: &Block, + _has_intrinsics: InitToken + ) -> FieldElement { + soft::proc_block(h, y, x) + } + + /// Process multiple blocks in parallel. + // TODO(tarcieri): currently just calls `proc_block` for each block on `soft`-only + pub(crate) fn proc_par_blocks( + powers_of_h: &[FieldElement; N], + y: FieldElement, + blocks: &Array, + _has_intrinsics: InitToken + ) -> FieldElement { + soft::proc_par_blocks(powers_of_h, y, blocks) + } + } + } +} + impl FieldElement { /// Compute the first N powers of h, in reverse order. #[inline] #[allow(dead_code)] // We may not use this in some configurations - fn powers_of_h(self) -> [Self; N] { + pub(crate) fn powers_of_h(self) -> [Self; N] { // TODO: improve pipelining by using more square operations? let mut pow = [Self::default(); N]; let mut prev = self; @@ -144,8 +179,7 @@ impl Mul for FieldElement { #[inline] fn mul(self, rhs: Self) -> Self { - let v = soft::karatsuba(self, rhs); - soft::mont_reduce(v) + soft::polymul(self, rhs) } } diff --git a/polyval/src/field_element/armv8.rs b/polyval/src/field_element/armv8.rs index 95f4640..658d7cd 100644 --- a/polyval/src/field_element/armv8.rs +++ b/polyval/src/field_element/armv8.rs @@ -13,154 +13,96 @@ //! For more information about PMULL, see: //! - //! - + #![allow(unsafe_op_in_unsafe_fn)] use super::FieldElement; -use crate::{Block, Key, Tag}; +use crate::Block; use core::{arch::aarch64::*, mem}; -use universal_hash::{ - KeyInit, ParBlocks, Reset, UhfBackend, - array::ArraySize, - common::{BlockSizeUser, KeySizeUser, ParBlocksSizeUser}, - consts::U16, - typenum::{Const, ToUInt, U}, -}; +use universal_hash::array::{Array, ArraySize}; -/// POLYVAL reduction polynomial (`x^128 + x^127 + x^126 + x^121 + 1`) encoded in little-endian -/// GF(2)[x] form with reflected reduction terms arising from folding the upper 128-bits of the -/// product into the lower half during modular reduction. -const POLY: u128 = (1 << 127) | (1 << 126) | (1 << 121) | (1 << 63) | (1 << 62) | (1 << 57); +/// 128-bit SIMD register type. +pub(super) type Simd128 = uint8x16_t; -/// **POLYVAL**: GHASH-like universal hash over GF(2^128). -/// -/// Parameterized on a constant that determines how many -/// blocks to process at once: higher numbers use more memory, -/// and require more time to re-key, but process data significantly -/// faster. +/// Perform carryless multiplication of `y` by `h` and return the result. /// -/// (This constant is not used when acceleration is not enabled.) -#[derive(Clone)] -pub struct Polyval { - /// Powers of H in descending order. - /// - /// (H^N, H^(N-1)...H) - h: [FieldElement; N], - y: FieldElement, -} - -impl KeySizeUser for Polyval { - type KeySize = U16; -} - -impl Polyval { - /// Initialize POLYVAL with the given `H` field element and initial block - pub fn new_with_init_block(h: &Key, init_block: u128) -> Self { - Self { - h: FieldElement::from(h).powers_of_h(), - y: init_block.into(), - } - } -} - -impl KeyInit for Polyval { - /// Initialize POLYVAL with the given `H` field element - fn new(h: &Key) -> Self { - Self::new_with_init_block(h, 0) - } -} - -impl BlockSizeUser for Polyval { - type BlockSize = U16; +/// # Safety +/// It is the caller's responsibility to ensure the host CPU is capable of PMULL and NEON +/// instructions. +// TODO(tarcieri): investigate ordering optimizations and fusions e.g.`fuse-crypto-eor` +#[inline] +#[target_feature(enable = "aes,neon")] +pub(super) unsafe fn polymul(y: Simd128, h: Simd128) -> Simd128 { + let (h, m, l) = karatsuba1(h, y); + let (h, l) = karatsuba2(h, m, l); + mont_reduce(h, l) } -impl ParBlocksSizeUser for Polyval -where - U: ArraySize, - Const: ToUInt, -{ - type ParBlocksSize = U; +/// Process an individual block. +/// +/// # Safety +/// It is the caller's responsibility to ensure the host CPU is capable of PMULL and NEON +/// instructions. +#[inline] +#[target_feature(enable = "aes,neon")] +pub(super) unsafe fn proc_block(h: FieldElement, y: FieldElement, x: &Block) -> FieldElement { + let y = veorq_u8(y.into(), vld1q_u8(x.as_ptr())); + polymul(y, h.into()).into() } -impl UhfBackend for Polyval -where - U: ArraySize, - Const: ToUInt, -{ - fn proc_par_blocks(&mut self, blocks: &ParBlocks) { - unsafe { - let mut h = vdupq_n_u8(0); - let mut m = vdupq_n_u8(0); - let mut l = vdupq_n_u8(0); - - // Note: Manually unrolling this loop did not help in benchmarks. - for i in (0..N).rev() { - let mut x = vld1q_u8(blocks[i].as_ptr()); - if i == 0 { - x = veorq_u8(x, self.y.into()); - } - let y = self.h[i]; - let (hh, mm, ll) = karatsuba1(x, y.into()); - h = veorq_u8(h, hh); - m = veorq_u8(m, mm); - l = veorq_u8(l, ll); +/// Process multiple blocks in parallel. +/// +/// # Safety +/// It is the caller's responsibility to ensure the host CPU is capable of PMULL and NEON +/// instructions. +#[target_feature(enable = "aes,neon")] +pub(super) unsafe fn proc_par_blocks( + powers_of_h: &[FieldElement; N], + y: FieldElement, + blocks: &Array, +) -> FieldElement { + unsafe { + let mut h = vdupq_n_u8(0); + let mut m = vdupq_n_u8(0); + let mut l = vdupq_n_u8(0); + + // Note: Manually unrolling this loop did not help in benchmarks. + for i in (0..N).rev() { + let mut x = vld1q_u8(blocks[i].as_ptr()); + if i == 0 { + x = veorq_u8(x, y.into()); } - - let (h, l) = karatsuba2(h, m, l); - self.y = mont_reduce(h, l).into(); + let (hh, mm, ll) = karatsuba1(x, powers_of_h[i].into()); + h = veorq_u8(h, hh); + m = veorq_u8(m, mm); + l = veorq_u8(l, ll); } - } - - fn proc_block(&mut self, x: &Block) { - unsafe { - let y = veorq_u8(self.y.into(), vld1q_u8(x.as_ptr())); - self.y = polymul(y, self.h[N - 1].into()).into(); - } - } -} -impl Reset for Polyval { - fn reset(&mut self) { - self.y = FieldElement::default(); + let (h, l) = karatsuba2(h, m, l); + mont_reduce(h, l).into() } } -impl Polyval { - /// Get POLYVAL output. - pub(crate) fn finalize(self) -> Tag { - self.y.into() - } -} - -impl From for uint8x16_t { +impl From for Simd128 { #[inline] - fn from(fe: FieldElement) -> uint8x16_t { + fn from(fe: FieldElement) -> Simd128 { unsafe { vld1q_u8(fe.0.as_ptr()) } } } -impl From for FieldElement { +impl From for FieldElement { #[inline] - fn from(fe: uint8x16_t) -> FieldElement { + fn from(fe: Simd128) -> FieldElement { let mut ret = FieldElement::default(); unsafe { vst1q_u8(ret.0.as_mut_ptr(), fe) } ret } } -/// Multiply "y" by "h" and return the result. -// TODO(tarcieri): investigate ordering optimizations and fusions e.g.`fuse-crypto-eor` -#[inline] -#[target_feature(enable = "neon")] -unsafe fn polymul(y: uint8x16_t, h: uint8x16_t) -> uint8x16_t { - let (h, m, l) = karatsuba1(h, y); - let (h, l) = karatsuba2(h, m, l); - mont_reduce(h, l) -} - /// Karatsuba decomposition for `x*y`. #[inline] -#[target_feature(enable = "neon")] -unsafe fn karatsuba1(x: uint8x16_t, y: uint8x16_t) -> (uint8x16_t, uint8x16_t, uint8x16_t) { +#[target_feature(enable = "aes,neon")] +unsafe fn karatsuba1(x: Simd128, y: Simd128) -> (Simd128, Simd128, Simd128) { // First Karatsuba step: decompose x and y. // // (x1*y0 + x0*y1) = (x1+x0) * (y1+y0) + (x1*y1) + (x0*y0) @@ -179,7 +121,7 @@ unsafe fn karatsuba1(x: uint8x16_t, y: uint8x16_t) -> (uint8x16_t, uint8x16_t, u /// Karatsuba combine. #[inline] #[target_feature(enable = "neon")] -unsafe fn karatsuba2(h: uint8x16_t, m: uint8x16_t, l: uint8x16_t) -> (uint8x16_t, uint8x16_t) { +unsafe fn karatsuba2(h: Simd128, m: Simd128, l: Simd128) -> (Simd128, Simd128) { // Second Karatsuba step: combine into a 2n-bit product. // // m0 ^= l0 ^ h0 // = m0^(l0^h0) @@ -218,9 +160,14 @@ unsafe fn karatsuba2(h: uint8x16_t, m: uint8x16_t, l: uint8x16_t) -> (uint8x16_t (x23, x01) } +/// POLYVAL reduction polynomial (`x^128 + x^127 + x^126 + x^121 + 1`) encoded in little-endian +/// GF(2)[x] form with reflected reduction terms arising from folding the upper 128-bits of the +/// product into the lower half during modular reduction. +const POLY: u128 = (1 << 127) | (1 << 126) | (1 << 121) | (1 << 63) | (1 << 62) | (1 << 57); + #[inline] -#[target_feature(enable = "neon")] -unsafe fn mont_reduce(x23: uint8x16_t, x01: uint8x16_t) -> uint8x16_t { +#[target_feature(enable = "aes,neon")] +unsafe fn mont_reduce(x23: Simd128, x01: Simd128) -> Simd128 { // Perform the Montgomery reduction over the 256-bit X. // [A1:A0] = X0 • poly // [B1:B0] = [X0 ⊕ A1 : X1 ⊕ A0] @@ -236,8 +183,8 @@ unsafe fn mont_reduce(x23: uint8x16_t, x01: uint8x16_t) -> uint8x16_t { /// Multiplies the low bits in `a` and `b`. #[inline] -#[target_feature(enable = "neon")] -unsafe fn pmull(a: uint8x16_t, b: uint8x16_t) -> uint8x16_t { +#[target_feature(enable = "aes,neon")] +unsafe fn pmull(a: Simd128, b: Simd128) -> Simd128 { mem::transmute(vmull_p64( vgetq_lane_u64(vreinterpretq_u64_u8(a), 0), vgetq_lane_u64(vreinterpretq_u64_u8(b), 0), @@ -246,19 +193,10 @@ unsafe fn pmull(a: uint8x16_t, b: uint8x16_t) -> uint8x16_t { /// Multiplies the high bits in `a` and `b`. #[inline] -#[target_feature(enable = "neon")] -unsafe fn pmull2(a: uint8x16_t, b: uint8x16_t) -> uint8x16_t { +#[target_feature(enable = "aes,neon")] +unsafe fn pmull2(a: Simd128, b: Simd128) -> Simd128 { mem::transmute(vmull_p64( vgetq_lane_u64(vreinterpretq_u64_u8(a), 1), vgetq_lane_u64(vreinterpretq_u64_u8(b), 1), )) } -// TODO(tarcieri): zeroize support -// #[cfg(feature = "zeroize")] -// impl Drop for Polyval { -// fn drop(&mut self) { -// use zeroize::Zeroize; -// self.h.zeroize(); -// self.y.zeroize(); -// } -// } diff --git a/polyval/src/field_element/autodetect.rs b/polyval/src/field_element/autodetect.rs index 6df8deb..e6721f0 100644 --- a/polyval/src/field_element/autodetect.rs +++ b/polyval/src/field_element/autodetect.rs @@ -1,135 +1,54 @@ //! Autodetection for CPU intrinsics, with fallback to the "soft" backend when //! they are unavailable. -use crate::{Key, Tag, field_element::soft}; -use core::mem::ManuallyDrop; -use universal_hash::{ - KeyInit, Reset, UhfClosure, UniversalHash, - array::ArraySize, - common::{BlockSizeUser, KeySizeUser}, - consts::U16, - typenum::{Const, ToUInt, U}, -}; - #[cfg(target_arch = "aarch64")] use super::armv8 as intrinsics; - #[cfg(any(target_arch = "x86_64", target_arch = "x86"))] use super::x86 as intrinsics; +use super::{FieldElement, soft}; +use crate::Block; +use universal_hash::array::{Array, ArraySize}; + #[cfg(target_arch = "aarch64")] cpufeatures::new!(mul_intrinsics, "aes"); // `aes` implies PMULL - #[cfg(any(target_arch = "x86_64", target_arch = "x86"))] cpufeatures::new!(mul_intrinsics, "pclmulqdq"); -/// **POLYVAL**: GHASH-like universal hash over GF(2^128). -/// -/// Parameterized on a constant that determines how many -/// blocks to process at once: higher numbers use more memory, -/// and require more time to re-key, but process data significantly -/// faster. -/// -/// (This constant is not used when acceleration is not enabled.) -pub struct Polyval { - inner: Inner, - token: mul_intrinsics::InitToken, -} - -union Inner { - intrinsics: ManuallyDrop>, - soft: ManuallyDrop>, -} - -impl KeySizeUser for Polyval { - type KeySize = U16; -} - -impl Polyval { - /// Initialize POLYVAL with the given `H` field element and initial block - #[must_use] - pub fn new_with_init_block(h: &Key, init_block: u128) -> Self { - let (token, has_intrinsics) = mul_intrinsics::init_get(); - - let inner = if has_intrinsics { - Inner { - intrinsics: ManuallyDrop::new(intrinsics::Polyval::new_with_init_block( - h, init_block, - )), - } - } else { - Inner { - soft: ManuallyDrop::new(soft::Polyval::new_with_init_block(h, init_block)), - } - }; - - Self { inner, token } - } -} - -impl KeyInit for Polyval { - /// Initialize POLYVAL with the given `H` field element - fn new(h: &Key) -> Self { - Self::new_with_init_block(h, 0) - } -} - -impl BlockSizeUser for Polyval { - type BlockSize = U16; -} - -impl UniversalHash for Polyval -where - U: ArraySize, - Const: ToUInt, -{ - fn update_with_backend(&mut self, f: impl UhfClosure) { - unsafe { - if self.token.get() { - f.call(&mut *self.inner.intrinsics); - } else { - f.call(&mut *self.inner.soft); - } - } - } - - /// Get POLYVAL result (i.e. computed `S` field element) - fn finalize(self) -> Tag { - unsafe { - if self.token.get() { - ManuallyDrop::into_inner(self.inner.intrinsics).finalize() - } else { - ManuallyDrop::into_inner(self.inner.soft).finalize() - } - } - } -} - -impl Clone for Polyval { - fn clone(&self) -> Self { - let inner = if self.token.get() { - Inner { - intrinsics: ManuallyDrop::new(unsafe { (*self.inner.intrinsics).clone() }), - } +pub(crate) use mul_intrinsics::{InitToken, init_get as detect_intrinsics}; + +impl FieldElement { + /// Default degree of parallelism, i.e. how many powers of `H` to compute. + pub const DEFAULT_PARALLELISM: usize = 8; + + /// Process an individual block. + pub(crate) fn proc_block( + h: FieldElement, + y: FieldElement, + block: &Block, + has_intrinsics: InitToken, + ) -> FieldElement { + if has_intrinsics.get() { + // SAFETY: we have checked the CPU has the necessary intrinsics above + unsafe { intrinsics::proc_block(h, y, block) } } else { - Inner { - soft: ManuallyDrop::new(unsafe { (*self.inner.soft).clone() }), - } - }; - - Self { - inner, - token: self.token, + soft::proc_block(h, y, block) } } -} -impl Reset for Polyval { - fn reset(&mut self) { - if self.token.get() { - unsafe { (*self.inner.intrinsics).reset() } + /// Process multiple blocks in parallel. + pub(crate) fn proc_par_blocks( + powers_of_h: &[FieldElement; N], + y: FieldElement, + blocks: &Array, + has_intrinsics: InitToken, + ) -> FieldElement { + if has_intrinsics.get() { + // SAFETY: we have checked the CPU has the necessary intrinsics above + unsafe { intrinsics::proc_par_blocks(powers_of_h, y, blocks) } } else { - unsafe { (*self.inner.soft).reset() } + // TODO(tarcieri): currently just calls `proc_block` for each block on `soft`-only + soft::proc_par_blocks(powers_of_h, y, blocks) } } } diff --git a/polyval/src/field_element/soft.rs b/polyval/src/field_element/soft.rs index 7daa031..ec76d3c 100644 --- a/polyval/src/field_element/soft.rs +++ b/polyval/src/field_element/soft.rs @@ -28,99 +28,40 @@ cpubits::cpubits! { } } -pub(super) use soft_impl::{karatsuba, mont_reduce}; - -use super::FieldElement; -use crate::{Block, Key, Tag}; +use crate::Block; +use crate::field_element::FieldElement; use core::{ num::Wrapping, ops::{BitAnd, BitOr, BitXor, Mul, Shl}, }; -use universal_hash::{ - KeyInit, Reset, UhfBackend, UhfClosure, UniversalHash, - common::{BlockSizeUser, KeySizeUser, ParBlocksSizeUser}, - consts::{U1, U16}, -}; - -#[cfg(feature = "zeroize")] -use zeroize::Zeroize; - -/// **POLYVAL**: GHASH-like universal hash over GF(2^128). -/// -/// Parameterized on a constant that determines how many -/// blocks to process at once: higher numbers use more memory, -/// and require more time to re-key, but process data significantly -/// faster. -/// -/// (This constant is not used when acceleration is not enabled.) -#[derive(Clone)] -pub struct Polyval { - /// GF(2^128) field element input blocks are multiplied by - h: FieldElement, - - /// Field element representing the computed universal hash - y: FieldElement, -} - -impl Polyval { - /// Initialize POLYVAL with the given `H` field element and initial block - pub fn new_with_init_block(h: &Key, init_block: u128) -> Self { - Self { - h: FieldElement::from(*h), - y: init_block.into(), - } - } -} - -impl KeySizeUser for Polyval { - type KeySize = U16; -} - -impl KeyInit for Polyval { - /// Initialize POLYVAL with the given `H` field element - fn new(h: &Key) -> Self { - Self::new_with_init_block(h, 0) - } -} - -impl BlockSizeUser for Polyval { - type BlockSize = U16; -} - -impl ParBlocksSizeUser for Polyval { - type ParBlocksSize = U1; -} - -impl UhfBackend for Polyval { - fn proc_block(&mut self, x: &Block) { - let x = FieldElement::from(x); - self.y = (self.y + x) * self.h; - } -} - -impl UniversalHash for Polyval { - fn update_with_backend(&mut self, f: impl UhfClosure) { - f.call(self); - } +use soft_impl::{karatsuba, mont_reduce}; +use universal_hash::array::{Array, ArraySize}; - /// Get POLYVAL result (i.e. computed `S` field element) - fn finalize(self) -> Tag { - self.y.into() - } +/// Perform carryless multiplication of `y` by `h` and return the result. +#[inline] +pub(super) fn polymul(y: FieldElement, h: FieldElement) -> FieldElement { + let v = karatsuba(y, h); + mont_reduce(v) } -impl Reset for Polyval { - fn reset(&mut self) { - self.y = FieldElement::default(); - } +/// Process an individual block. +// TODO(tarcieri): implement `proc_par_blocks` for soft backend? +pub(super) fn proc_block(h: FieldElement, y: FieldElement, x: &Block) -> FieldElement { + let x = FieldElement::from(x); + polymul(y + x, h) } -#[cfg(feature = "zeroize")] -impl Drop for Polyval { - fn drop(&mut self) { - self.h.zeroize(); - self.y.zeroize(); +/// Process multiple blocks. +// TODO(tarcieri): optimized implementation? +pub(super) fn proc_par_blocks( + powers_of_h: &[FieldElement; N], + mut y: FieldElement, + blocks: &Array, +) -> FieldElement { + for block in blocks.iter() { + y = proc_block(powers_of_h[N - 1], y, block); } + y } /// Multiplication in GF(2)[X], implemented generically and wrapped as `bmul32` and `bmul64`. diff --git a/polyval/src/field_element/soft/soft32.rs b/polyval/src/field_element/soft/soft32.rs index 5b172ce..073653f 100644 --- a/polyval/src/field_element/soft/soft32.rs +++ b/polyval/src/field_element/soft/soft32.rs @@ -56,7 +56,7 @@ impl FieldElement { /// multiplications, hence nine 32x32 multiplications. With the bit-reversal trick, we have to /// perform 18 32x32 multiplications. #[inline] -pub(crate) fn karatsuba(h: FieldElement, y: FieldElement) -> [u32; 8] { +pub(super) fn karatsuba(h: FieldElement, y: FieldElement) -> [u32; 8] { let hw = h.to_u32x4(); let yw = y.to_u32x4(); let hwr = [ @@ -149,7 +149,7 @@ fn bmul32(x: u32, y: u32) -> u32 { /// /// This is closely related to GHASH reduction but the bit order is reversed in POLYVAL. #[inline] -pub(crate) fn mont_reduce(mut zw: [u32; 8]) -> FieldElement { +pub(super) fn mont_reduce(mut zw: [u32; 8]) -> FieldElement { for i in 0..4 { let lw = zw[i]; zw[i + 4] ^= lw ^ (lw >> 1) ^ (lw >> 2) ^ (lw >> 7); diff --git a/polyval/src/field_element/x86.rs b/polyval/src/field_element/x86.rs index 41008df..c20a727 100644 --- a/polyval/src/field_element/x86.rs +++ b/polyval/src/field_element/x86.rs @@ -1,8 +1,10 @@ //! Intel `CLMUL`-accelerated implementation for modern x86/x86_64 CPUs //! (i.e. Intel Sandy Bridge-compatible or newer) //! -//! Based on implementation by Eric Lagergren -//! at . +//! Based on implementation by Eric Lagergren at +//! . + +#![allow(unsafe_op_in_unsafe_fn, unused_unsafe)] #[cfg(target_arch = "x86")] use core::arch::x86::*; @@ -10,183 +12,85 @@ use core::arch::x86::*; use core::arch::x86_64::*; use super::FieldElement; -use crate::{Block, Key, Tag}; +use crate::Block; use core::ptr; -use universal_hash::{ - KeyInit, ParBlocks, Reset, UhfBackend, - array::ArraySize, - common::{BlockSizeUser, KeySizeUser, ParBlocksSizeUser}, - consts::U16, - typenum::{Const, ToUInt, U}, -}; +use universal_hash::array::{Array, ArraySize}; + +/// 128-bit SIMD register type. +pub(super) type Simd128 = __m128i; -/// **POLYVAL**: GHASH-like universal hash over GF(2^128). +/// Perform carryless multiplication of `y` by `h` and return the result. /// -/// Parameterized on a constant that determines how many -/// blocks to process at once: higher numbers use more memory, -/// and require more time to re-key, but process data significantly -/// faster. +/// # Safety /// -/// (This constant is not used when acceleration is not enabled.) -#[derive(Clone)] -pub struct Polyval { - /// Powers of H in descending order. - /// - /// (H^N, H^(N-1)...H) - h: [FieldElement; N], - y: FieldElement, -} - -impl KeySizeUser for Polyval { - type KeySize = U16; -} - -impl Polyval { - /// Initialize POLYVAL with the given `H` field element and initial block - pub fn new_with_init_block(h: &Key, init_block: u128) -> Self { - Self { - h: FieldElement::from(h).powers_of_h(), - y: init_block.into(), - } - } -} - -impl KeyInit for Polyval { - /// Initialize POLYVAL with the given `H` field element - fn new(h: &Key) -> Self { - Self::new_with_init_block(h, 0) - } -} - -impl BlockSizeUser for Polyval { - type BlockSize = U16; -} - -impl ParBlocksSizeUser for Polyval -where - U: ArraySize, - Const: ToUInt, -{ - type ParBlocksSize = U; -} - -impl UhfBackend for Polyval -where - U: ArraySize, - Const: ToUInt, -{ - fn proc_par_blocks(&mut self, blocks: &ParBlocks) { - unsafe { - self.mul_par_blocks(blocks); - } - } - - fn proc_block(&mut self, x: &Block) { - unsafe { - self.mul(x); - } +/// The SSE2 and pclmulqdq target features must be enabled. +#[inline] +#[target_feature(enable = "pclmulqdq,sse2")] +pub(super) unsafe fn polymul(x: Simd128, y: Simd128) -> Simd128 { + let (h, m, l) = unsafe { karatsuba1(x, y) }; + let (h, l) = unsafe { karatsuba2(h, m, l) }; + unsafe { + mont_reduce(h, l) // d } } -impl Polyval { - /// Get Polyval output - pub(crate) fn finalize(self) -> Tag { - unsafe { core::mem::transmute(self.y) } - } +/// Perform carryless multiplication of `y` by `h` and return the result. +#[inline] +#[target_feature(enable = "pclmulqdq")] +pub(super) unsafe fn proc_block(h: FieldElement, y: FieldElement, x: &Block) -> FieldElement { + let x = unsafe { _mm_loadu_si128(x.as_ptr().cast()) }; + unsafe { polymul(_mm_xor_si128(y.into(), x), h.into()).into() } } -impl Polyval -where - U: ArraySize, - Const: ToUInt, -{ - #[inline] - #[target_feature(enable = "pclmulqdq")] - unsafe fn mul_par_blocks(&mut self, blocks: &ParBlocks) { - unsafe { - let mut h = _mm_setzero_si128(); - let mut m = _mm_setzero_si128(); - let mut l = _mm_setzero_si128(); - - // Note: Manually unrolling this loop did not help in benchmarks. - for i in (0..N).rev() { - let mut x = _mm_loadu_si128(blocks[i].as_ptr().cast()); - if i == 0 { - x = _mm_xor_si128(x, self.y.into()); - } - let y = self.h[i]; - let (hh, mm, ll) = karatsuba1(x, y.into()); - h = _mm_xor_si128(h, hh); - m = _mm_xor_si128(m, mm); - l = _mm_xor_si128(l, ll); +#[inline] +#[target_feature(enable = "pclmulqdq")] +pub(super) unsafe fn proc_par_blocks( + powers_of_h: &[FieldElement; N], + y: FieldElement, + blocks: &Array, +) -> FieldElement { + unsafe { + let mut h = _mm_setzero_si128(); + let mut m = _mm_setzero_si128(); + let mut l = _mm_setzero_si128(); + + // Note: Manually unrolling this loop did not help in benchmarks. + for i in (0..N).rev() { + let mut x = _mm_loadu_si128(blocks[i].as_ptr().cast()); + if i == 0 { + x = _mm_xor_si128(x, y.into()); } - - let (h, l) = karatsuba2(h, m, l); - self.y = mont_reduce(h, l).into(); + let (hh, mm, ll) = karatsuba1(x, powers_of_h[i].into()); + h = _mm_xor_si128(h, hh); + m = _mm_xor_si128(m, mm); + l = _mm_xor_si128(l, ll); } - } - - #[inline] - #[target_feature(enable = "pclmulqdq")] - #[allow(unsafe_op_in_unsafe_fn)] - unsafe fn mul(&mut self, x: &Block) { - let x = _mm_loadu_si128(x.as_ptr().cast()); - self.y = polymul(_mm_xor_si128(self.y.into(), x), self.h[N - 1].into()).into(); - } -} - -impl Reset for Polyval { - fn reset(&mut self) { - self.y = FieldElement::default(); - } -} -#[cfg(feature = "zeroize")] -impl Drop for Polyval { - fn drop(&mut self) { - use zeroize::Zeroize; - self.h.zeroize(); - self.y.zeroize(); + let (h, l) = karatsuba2(h, m, l); + mont_reduce(h, l).into() } } -impl From for __m128i { +impl From for Simd128 { #[inline] - fn from(fe: FieldElement) -> __m128i { + fn from(fe: FieldElement) -> Simd128 { unsafe { _mm_loadu_si128(fe.0.as_ptr().cast()) } } } -impl From<__m128i> for FieldElement { +impl From for FieldElement { #[inline] - fn from(fe: __m128i) -> FieldElement { + fn from(fe: Simd128) -> FieldElement { let mut ret = FieldElement::default(); unsafe { _mm_store_si128(ret.0.as_mut_ptr().cast(), fe) } ret } } -/// # Safety -/// -/// The SSE2 and pclmulqdq target features must be enabled. -#[inline] -#[target_feature(enable = "sse2,pclmulqdq")] -#[allow(unused_unsafe)] -#[allow(clippy::undocumented_unsafe_blocks, reason = "Too many unsafe blocks.")] -unsafe fn polymul(x: __m128i, y: __m128i) -> __m128i { - let (h, m, l) = unsafe { karatsuba1(x, y) }; - let (h, l) = unsafe { karatsuba2(h, m, l) }; - unsafe { - mont_reduce(h, l) // d - } -} - /// Karatsuba decomposition for `x*y`. #[inline] -#[target_feature(enable = "sse2,pclmulqdq")] -#[allow(unused_unsafe)] -#[allow(clippy::undocumented_unsafe_blocks, reason = "Too many unsafe blocks.")] -unsafe fn karatsuba1(x: __m128i, y: __m128i) -> (__m128i, __m128i, __m128i) { +#[target_feature(enable = "pclmulqdq,sse2")] +unsafe fn karatsuba1(x: Simd128, y: Simd128) -> (Simd128, Simd128, Simd128) { // First Karatsuba step: decompose x and y. // // (x1*y0 + x0*y1) = (x1+x0) * (y1+x0) + (x1*y1) + (x0*y0) @@ -194,22 +98,20 @@ unsafe fn karatsuba1(x: __m128i, y: __m128i) -> (__m128i, __m128i, __m128i) { // // m = x.hi^x.lo * y.hi^y.lo let m = unsafe { - pmull( + clmul( _mm_xor_si128(x, _mm_shuffle_epi32(x, 0xee)), _mm_xor_si128(y, _mm_shuffle_epi32(y, 0xee)), ) }; - let h = unsafe { pmull2(y, x) }; // h = x.hi * y.hi - let l = unsafe { pmull(y, x) }; // l = x.lo * y.lo + let h = unsafe { clmul2(y, x) }; // h = x.hi * y.hi + let l = unsafe { clmul(y, x) }; // l = x.lo * y.lo (h, m, l) } /// Karatsuba combine. #[inline] -#[target_feature(enable = "sse2,pclmulqdq")] -#[allow(unused_unsafe)] -#[allow(clippy::undocumented_unsafe_blocks, reason = "Too many unsafe blocks.")] -unsafe fn karatsuba2(h: __m128i, m: __m128i, l: __m128i) -> (__m128i, __m128i) { +#[target_feature(enable = "pclmulqdq,sse2")] +unsafe fn karatsuba2(h: Simd128, m: Simd128, l: Simd128) -> (Simd128, Simd128) { // Second Karatsuba step: combine into a 2n-bit product. // // m0 ^= l0 ^ h0 // = m0^(l0^h0) @@ -242,21 +144,20 @@ unsafe fn karatsuba2(h: __m128i, m: __m128i, l: __m128i) -> (__m128i, __m128i) { // {m0^l1^h0^l0, l0} let x01 = unsafe { _mm_unpacklo_epi64(l, t) }; - // {h1, m1^h0^h1^l1} let x23 = unsafe { _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(h), _mm_castsi128_ps(t))) }; (x23, x01) } -/// # Safety +/// Perform Montgomery reduction of the 256-bit product into 128-bits. /// -/// The SSE2 and pclmulqdq target features must be enavled. +/// # Safety +/// It is the caller's responsibility to ensure the host CPU is capable of CLMUL and SSE2 +/// instructions. #[inline] -#[target_feature(enable = "sse2,pclmulqdq")] -#[allow(unused_unsafe)] -#[allow(clippy::undocumented_unsafe_blocks, reason = "Too many unsafe blocks.")] -unsafe fn mont_reduce(x23: __m128i, x01: __m128i) -> __m128i { +#[target_feature(enable = "pclmulqdq,sse2")] +unsafe fn mont_reduce(x23: Simd128, x01: Simd128) -> Simd128 { // Perform the Montgomery reduction over the 256-bit X. // [A1:A0] = X0 • poly // [B1:B0] = [X0 ⊕ A1 : X1 ⊕ A0] @@ -265,21 +166,20 @@ unsafe fn mont_reduce(x23: __m128i, x01: __m128i) -> __m128i { // Output: [D1 ⊕ X3 : D0 ⊕ X2] static POLY: u128 = (1 << 127) | (1 << 126) | (1 << 121) | (1 << 63) | (1 << 62) | (1 << 57); let poly = unsafe { _mm_loadu_si128(ptr::addr_of!(POLY).cast()) }; - let a = unsafe { pmull(x01, poly) }; + let a = unsafe { clmul(x01, poly) }; let b = unsafe { _mm_xor_si128(x01, _mm_shuffle_epi32(a, 0x4e)) }; - let c = unsafe { pmull2(b, poly) }; + let c = unsafe { clmul2(b, poly) }; unsafe { _mm_xor_si128(x23, _mm_xor_si128(c, b)) } } /// Multiplies the low bits in `a` and `b`. /// /// # Safety -/// -/// The SSE2 and pclmulqdq target features must be enabled. +/// It is the caller's responsibility to ensure the host CPU is capable of CLMUL and SSE2 +/// instructions. #[inline] -#[allow(unused_unsafe)] -#[target_feature(enable = "sse2,pclmulqdq")] -unsafe fn pmull(a: __m128i, b: __m128i) -> __m128i { +#[target_feature(enable = "pclmulqdq,sse2")] +unsafe fn clmul(a: Simd128, b: Simd128) -> Simd128 { // SAFETY: This requires the `sse2` and `pclmulqdq` features // which we have. unsafe { _mm_clmulepi64_si128(a, b, 0x00) } @@ -288,12 +188,11 @@ unsafe fn pmull(a: __m128i, b: __m128i) -> __m128i { /// Multiplies the high bits in `a` and `b`. /// /// # Safety -/// -/// The SSE2 and pclmulqdq target features must be enavled. +/// It is the caller's responsibility to ensure the host CPU is capable of CLMUL and SSE2 +/// instructions. #[inline] -#[allow(unused_unsafe)] -#[target_feature(enable = "sse2,pclmulqdq")] -unsafe fn pmull2(a: __m128i, b: __m128i) -> __m128i { +#[target_feature(enable = "pclmulqdq,sse2")] +unsafe fn clmul2(a: Simd128, b: Simd128) -> Simd128 { // SAFETY: This requires the `sse2` and `pclmulqdq` features // which we have. unsafe { _mm_clmulepi64_si128(a, b, 0x11) } diff --git a/polyval/src/lib.rs b/polyval/src/lib.rs index eb4ce04..2a77c23 100644 --- a/polyval/src/lib.rs +++ b/polyval/src/lib.rs @@ -9,14 +9,21 @@ mod field_element; mod mulx; -pub use crate::{field_element::PolyvalGeneric, mulx::mulx}; +pub use crate::mulx::mulx; pub use universal_hash; -impl core::fmt::Debug for PolyvalGeneric { - fn fmt(&self, f: &mut core::fmt::Formatter) -> Result<(), core::fmt::Error> { - write!(f, "PolyvalGeneric<{}> {{ ... }}", N) - } -} +use core::fmt::{self, Debug}; +use field_element::{FieldElement, InitToken, detect_intrinsics}; +use universal_hash::{ + KeyInit, ParBlocks, Reset, UhfBackend, UhfClosure, UniversalHash, + array::{Array, ArraySize}, + common::{BlockSizeUser, KeySizeUser, ParBlocksSizeUser}, + consts::U16, + typenum::{Const, ToUInt, U}, +}; + +#[cfg(feature = "zeroize")] +use zeroize::Zeroize; /// Size of a POLYVAL block in bytes pub const BLOCK_SIZE: usize = 16; @@ -25,13 +32,133 @@ pub const BLOCK_SIZE: usize = 16; pub const KEY_SIZE: usize = 16; /// POLYVAL keys (16-bytes) -pub type Key = universal_hash::Key; +pub type Key = Array; /// POLYVAL blocks (16-bytes) -pub type Block = universal_hash::Block; +pub type Block = Array; /// POLYVAL tags (16-bytes) -pub type Tag = universal_hash::Block; +pub type Tag = Array; + +/// **POLYVAL**: GHASH-like universal hash over GF(2^128). +/// +/// This type alias uses the default amount of parallelism for the target (`8` for `aarch64`/`x86`, +/// `1` for other targets using a pure Rust fallback implementation). +pub type Polyval = PolyvalGeneric<{ FieldElement::DEFAULT_PARALLELISM }>; /// **POLYVAL**: GHASH-like universal hash over GF(2^128). -pub type Polyval = PolyvalGeneric<8>; +/// +/// Parameterized on a constant that determines how many blocks to process at once: higher numbers +/// use more memory, and require more time to re-key, but process data significantly faster. +/// +/// (This constant is not used when acceleration is not enabled.) +#[derive(Clone)] +pub struct PolyvalGeneric { + /// Powers of H in descending order. + /// + /// (H^N, H^(N-1)...H) + powers_of_h: [FieldElement; N], + + /// Accumulator for POLYVAL computation. + y: FieldElement, + + /// Token for accessing CPU feature detection results. + has_intrinsics: InitToken, +} + +impl PolyvalGeneric { + /// Initialize POLYVAL with the given `H` field element. + #[must_use] + pub fn new(h: &Key) -> Self { + Self::new_with_init_block(h, 0) + } + + /// Initialize POLYVAL with the given `H` field element and initial block. + #[must_use] + pub fn new_with_init_block(h: &Key, init_block: u128) -> Self { + let (token, _has_intrinsics) = detect_intrinsics(); + Self { + powers_of_h: FieldElement::from(h).powers_of_h(), + y: init_block.into(), + has_intrinsics: token, + } + } + + /// Get `h` from the powers-of-`H`. + #[inline] + pub(crate) fn h(&self) -> FieldElement { + self.powers_of_h[N - 1] + } +} + +impl KeyInit for PolyvalGeneric { + fn new(h: &Key) -> Self { + Self::new(h) + } +} + +impl KeySizeUser for PolyvalGeneric { + type KeySize = U16; +} + +impl BlockSizeUser for PolyvalGeneric { + type BlockSize = U16; +} + +impl ParBlocksSizeUser for PolyvalGeneric +where + U: ArraySize, + Const: ToUInt, +{ + type ParBlocksSize = U; +} + +impl UniversalHash for PolyvalGeneric +where + U: ArraySize, + Const: ToUInt, +{ + fn update_with_backend(&mut self, f: impl UhfClosure) { + f.call(self); + } + + fn finalize(self) -> Tag { + self.y.into() + } +} + +#[allow(clippy::unit_arg)] +impl UhfBackend for PolyvalGeneric +where + U: ArraySize, + Const: ToUInt, +{ + fn proc_block(&mut self, block: &Block) { + self.y = FieldElement::proc_block(self.h(), self.y, block, self.has_intrinsics); + } + + fn proc_par_blocks(&mut self, blocks: &ParBlocks) { + self.y = + FieldElement::proc_par_blocks(&self.powers_of_h, self.y, blocks, self.has_intrinsics); + } +} + +impl Reset for PolyvalGeneric { + fn reset(&mut self) { + self.y = FieldElement::default(); + } +} + +#[cfg(feature = "zeroize")] +impl Drop for PolyvalGeneric { + fn drop(&mut self) { + self.powers_of_h.zeroize(); + self.y.zeroize(); + } +} + +impl Debug for PolyvalGeneric { + fn fmt(&self, f: &mut fmt::Formatter) -> Result<(), fmt::Error> { + write!(f, "PolyvalGeneric<{}> {{ ... }}", N) + } +}