From 0530aae7a642a0d04075ba239ac9808844da3346 Mon Sep 17 00:00:00 2001 From: Tony Arcieri Date: Tue, 27 Jan 2026 11:24:51 -0700 Subject: [PATCH] polyval: use intrinsics when computing powers of `H` The only time we actually use powers-of-`H` is when we have intrinsics anyway, so we might as well make use of the intrinsics when computing them. This factors the computation into the `autodetect` module with autodetection support and a fallback to a stub implementation for the `soft` backend which only initializes and accesses the last element (which is layout-compatible with the intrinsics implementation, if we ever decide to add the requisite support to the soft backend). --- polyval/src/field_element.rs | 38 +++++++++---------------- polyval/src/field_element/autodetect.rs | 27 ++++++++++++++++-- polyval/src/field_element/soft.rs | 11 +++++++ polyval/src/lib.rs | 8 +++--- 4 files changed, 53 insertions(+), 31 deletions(-) diff --git a/polyval/src/field_element.rs b/polyval/src/field_element.rs index a42913f..8c275fa 100644 --- a/polyval/src/field_element.rs +++ b/polyval/src/field_element.rs @@ -37,7 +37,7 @@ cfg_if! { // aarch64 mod autodetect; mod armv8; - pub(crate) use autodetect::{InitToken, detect_intrinsics}; + pub(crate) use autodetect::{InitToken, has_intrinsics}; } else if #[cfg(all( any(target_arch = "x86_64", target_arch = "x86"), not(polyval_backend = "soft") @@ -45,13 +45,13 @@ cfg_if! { // x86/x86-64 mod autodetect; mod x86; - pub(crate) use autodetect::{InitToken, detect_intrinsics}; + pub(crate) use autodetect::{InitToken, has_intrinsics}; } else { - // Pure Rust fallback implementation for other targets + // "soft" fallback implementation for other targets written in pure Rust use universal_hash::array::{Array, ArraySize}; pub(crate) type InitToken = (); - pub(crate) fn detect_intrinsics() -> (InitToken, bool) { + pub(crate) fn has_intrinsics() -> (InitToken, bool) { ((), false) } @@ -59,6 +59,16 @@ cfg_if! { /// Default degree of parallelism, i.e. 
how many powers of `H` to compute. pub const DEFAULT_PARALLELISM: usize = 8; + /// Stub implementation that works with `Polyval::h` even though we don't support + /// `proc_par_blocks`. + #[inline] + pub(crate) fn powers_of_h( + self, + _has_intrinsics: InitToken + ) -> [Self; N] { + soft::powers_of_h(self) + } + /// Process an individual block. pub(crate) fn proc_block( h: FieldElement, @@ -83,26 +93,6 @@ cfg_if! { } } -impl FieldElement { - /// Compute the first N powers of h, in reverse order. - #[inline] - #[allow(dead_code)] // We may not use this in some configurations - pub(crate) fn powers_of_h(self) -> [Self; N] { - // TODO: improve pipelining by using more square operations? - let mut pow = [Self::default(); N]; - let mut prev = self; - - for (i, v) in pow.iter_mut().rev().enumerate() { - *v = self; - if i > 0 { - *v *= prev; - } - prev = *v; - } - pow - } -} - impl Debug for FieldElement { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { write!(f, "FieldElement(")?; diff --git a/polyval/src/field_element/autodetect.rs b/polyval/src/field_element/autodetect.rs index e6721f0..2e3f6fa 100644 --- a/polyval/src/field_element/autodetect.rs +++ b/polyval/src/field_element/autodetect.rs @@ -11,16 +11,37 @@ use crate::Block; use universal_hash::array::{Array, ArraySize}; #[cfg(target_arch = "aarch64")] -cpufeatures::new!(mul_intrinsics, "aes"); // `aes` implies PMULL +cpufeatures::new!(detect_intrinsics, "aes"); // `aes` implies PMULL #[cfg(any(target_arch = "x86_64", target_arch = "x86"))] -cpufeatures::new!(mul_intrinsics, "pclmulqdq"); +cpufeatures::new!(detect_intrinsics, "pclmulqdq"); -pub(crate) use mul_intrinsics::{InitToken, init_get as detect_intrinsics}; +pub(crate) use detect_intrinsics::{InitToken, init_get as has_intrinsics}; impl FieldElement { /// Default degree of parallelism, i.e. how many powers of `H` to compute. pub const DEFAULT_PARALLELISM: usize = 8; + /// Compute the first N powers of h, in reverse order. 
+ #[inline] + pub(crate) fn powers_of_h(self, has_intrinsics: InitToken) -> [Self; N] { + if has_intrinsics.get() { + // TODO: improve pipelining by using more square operations? + let mut pow = [Self::default(); N]; + let mut prev = self; + + for (i, v) in pow.iter_mut().rev().enumerate() { + *v = self; + if i > 0 { + *v = unsafe { intrinsics::polymul((*v).into(), prev.into()) }.into(); + } + prev = *v; + } + pow + } else { + soft::powers_of_h(self) + } + } + /// Process an individual block. pub(crate) fn proc_block( h: FieldElement, diff --git a/polyval/src/field_element/soft.rs b/polyval/src/field_element/soft.rs index ec76d3c..c49a937 100644 --- a/polyval/src/field_element/soft.rs +++ b/polyval/src/field_element/soft.rs @@ -37,6 +37,15 @@ use core::{ use soft_impl::{karatsuba, mont_reduce}; use universal_hash::array::{Array, ArraySize}; +/// Stub implementation which only makes `PolyvalGeneric::h` work. +// TODO(tarcieri): actually implement this optimization? +#[inline] +pub(super) fn powers_of_h(h: FieldElement) -> [FieldElement; N] { + let mut ret = [FieldElement::default(); N]; + ret[N - 1] = h; + ret +} + /// Perform carryless multiplication of `y` by `h` and return the result. #[inline] pub(super) fn polymul(y: FieldElement, h: FieldElement) -> FieldElement { @@ -46,6 +55,7 @@ pub(super) fn polymul(y: FieldElement, h: FieldElement) -> FieldElement { /// Process an individual block. // TODO(tarcieri): implement `proc_par_blocks` for soft backend? +#[inline] pub(super) fn proc_block(h: FieldElement, y: FieldElement, x: &Block) -> FieldElement { let x = FieldElement::from(x); polymul(y + x, h) @@ -53,6 +63,7 @@ pub(super) fn proc_block(h: FieldElement, y: FieldElement, x: &Block) -> FieldEl /// Process multiple blocks. // TODO(tarcieri): optimized implementation? 
+#[inline] pub(super) fn proc_par_blocks( powers_of_h: &[FieldElement; N], mut y: FieldElement, diff --git a/polyval/src/lib.rs b/polyval/src/lib.rs index 2a77c23..2baf7dd 100644 --- a/polyval/src/lib.rs +++ b/polyval/src/lib.rs @@ -13,7 +13,7 @@ pub use crate::mulx::mulx; pub use universal_hash; use core::fmt::{self, Debug}; -use field_element::{FieldElement, InitToken, detect_intrinsics}; +use field_element::{FieldElement, InitToken, has_intrinsics}; use universal_hash::{ KeyInit, ParBlocks, Reset, UhfBackend, UhfClosure, UniversalHash, array::{Array, ArraySize}, @@ -76,11 +76,11 @@ impl PolyvalGeneric { /// Initialize POLYVAL with the given `H` field element and initial block. #[must_use] pub fn new_with_init_block(h: &Key, init_block: u128) -> Self { - let (token, _has_intrinsics) = detect_intrinsics(); + let has_intrinsics = has_intrinsics().0; Self { - powers_of_h: FieldElement::from(h).powers_of_h(), + powers_of_h: FieldElement::from(h).powers_of_h(has_intrinsics), y: init_block.into(), - has_intrinsics: token, + has_intrinsics, } }