RustCrypto · tarcieri · Jan 27, 2026 · Jan 27, 2026
diff --git a/polyval/benches/polyval.rs b/polyval/benches/polyval.rs
@@ -4,10 +4,7 @@
 
 extern crate test;
 
-use polyval::{
-    Polyval,
-    universal_hash::{KeyInit, UniversalHash},
-};
+use polyval::{Polyval, universal_hash::UniversalHash};
 use test::Bencher;
 
 // TODO(tarcieri): move this into the `universal-hash` crate

diff --git a/polyval/src/field_element.rs b/polyval/src/field_element.rs
@@ -3,31 +3,15 @@
 mod soft;
 
 use crate::{BLOCK_SIZE, Block};
-use core::fmt;
-use core::fmt::Debug;
-use core::ops::{Add, Mul, MulAssign};
+use core::{
+    fmt::{self, Debug},
+    ops::{Add, Mul, MulAssign},
+};
 use cpubits::cfg_if;
 
 #[cfg(feature = "zeroize")]
 use zeroize::Zeroize;
 
-cfg_if! {
-    if #[cfg(all(target_arch = "aarch64", not(polyval_backend = "soft")))] {
-        mod autodetect;
-        mod armv8;
-        pub use autodetect::Polyval as PolyvalGeneric;
-    } else if #[cfg(all(
-        any(target_arch = "x86_64", target_arch = "x86"),
-        not(polyval_backend = "soft")
-    ))] {
-        mod autodetect;
-        mod x86;
-        pub use autodetect::Polyval as PolyvalGeneric;
-    } else {
-        pub use soft::Polyval as PolyvalGeneric;
-    }
-}
-
 /// An element in POLYVAL's field.
 ///
 /// This type represents an element of the binary field GF(2^128) modulo the irreducible polynomial
@@ -48,11 +32,62 @@ cfg_if! {
 #[repr(C, align(16))] // Make ABI and alignment compatible with SIMD registers
 pub(crate) struct FieldElement([u8; BLOCK_SIZE]);
 
+cfg_if! {
+    if #[cfg(all(target_arch = "aarch64", not(polyval_backend = "soft")))] {
+        // aarch64
+        mod autodetect;
+        mod armv8;
+        pub(crate) use autodetect::{InitToken, detect_intrinsics};
+    } else if #[cfg(all(
+        any(target_arch = "x86_64", target_arch = "x86"),
+        not(polyval_backend = "soft")
+    ))] {
+        // x86/x86-64
+        mod autodetect;
+        mod x86;
+        pub(crate) use autodetect::{InitToken, detect_intrinsics};
+    } else {
+        // Pure Rust fallback implementation for other targets
+        use universal_hash::array::{Array, ArraySize};
+
+        pub(crate) type InitToken = ();
+        pub(crate) fn detect_intrinsics() -> (InitToken, bool) {
+            ((), false)
+        }
+
+        impl FieldElement {
+            /// Default degree of parallelism, i.e. how many powers of `H` to compute.
+            pub const DEFAULT_PARALLELISM: usize = 8;
+
+            /// Process an individual block.
+            pub(crate) fn proc_block(
+                h: FieldElement,
+                y: FieldElement,
+                x: &Block,
+                _has_intrinsics: InitToken
+            ) -> FieldElement {
+                soft::proc_block(h, y, x)
+            }
+
+            /// Process multiple blocks in parallel.
+            // TODO(tarcieri): currently just calls `proc_block` for each block on `soft`-only
+            pub(crate) fn proc_par_blocks<const N: usize, U: ArraySize>(
+                powers_of_h: &[FieldElement; N],
+                y: FieldElement,
+                blocks: &Array<Block, U>,
+                _has_intrinsics: InitToken
+            ) -> FieldElement {
+                soft::proc_par_blocks(powers_of_h, y, blocks)
+            }
+        }
+    }
+}
+
 impl FieldElement {
     /// Compute the first N powers of h, in reverse order.
     #[inline]
     #[allow(dead_code)] // We may not use this in some configurations
-    fn powers_of_h<const N: usize>(self) -> [Self; N] {
+    pub(crate) fn powers_of_h<const N: usize>(self) -> [Self; N] {
         // TODO: improve pipelining by using more square operations?
         let mut pow = [Self::default(); N];
         let mut prev = self;
@@ -144,8 +179,7 @@ impl Mul for FieldElement {
 
     #[inline]
     fn mul(self, rhs: Self) -> Self {
-        let v = soft::karatsuba(self, rhs);
-        soft::mont_reduce(v)
+        soft::polymul(self, rhs)
     }
 }
 

diff --git a/polyval/src/field_element/armv8.rs b/polyval/src/field_element/armv8.rs
@@ -13,154 +13,96 @@
 //! For more information about PMULL, see:
 //! - <https://developer.arm.com/documentation/100069/0608/A64-SIMD-Vector-Instructions/PMULL--PMULL2--vector->
 //! - <https://eprint.iacr.org/2015/688.pdf>
+
 #![allow(unsafe_op_in_unsafe_fn)]
 
 use super::FieldElement;
-use crate::{Block, Key, Tag};
+use crate::Block;
 use core::{arch::aarch64::*, mem};
-use universal_hash::{
-    KeyInit, ParBlocks, Reset, UhfBackend,
-    array::ArraySize,
-    common::{BlockSizeUser, KeySizeUser, ParBlocksSizeUser},
-    consts::U16,
-    typenum::{Const, ToUInt, U},
-};
+use universal_hash::array::{Array, ArraySize};
 
-/// POLYVAL reduction polynomial (`x^128 + x^127 + x^126 + x^121 + 1`) encoded in little-endian
-/// GF(2)[x] form with reflected reduction terms arising from folding the upper 128-bits of the
-/// product into the lower half during modular reduction.
-const POLY: u128 = (1 << 127) | (1 << 126) | (1 << 121) | (1 << 63) | (1 << 62) | (1 << 57);
+/// 128-bit SIMD register type.
+pub(super) type Simd128 = uint8x16_t;
 
-/// **POLYVAL**: GHASH-like universal hash over GF(2^128).
-///
-/// Parameterized on a constant that determines how many
-/// blocks to process at once: higher numbers use more memory,
-/// and require more time to re-key, but process data significantly
-/// faster.
+/// Perform carryless multiplication of `y` by `h` and return the result.
 ///
-/// (This constant is not used when acceleration is not enabled.)
-#[derive(Clone)]
-pub struct Polyval<const N: usize = 8> {
-    /// Powers of H in descending order.
-    ///
-    /// (H^N, H^(N-1)...H)
-    h: [FieldElement; N],
-    y: FieldElement,
-}
-
-impl<const N: usize> KeySizeUser for Polyval<N> {
-    type KeySize = U16;
-}
-
-impl<const N: usize> Polyval<N> {
-    /// Initialize POLYVAL with the given `H` field element and initial block
-    pub fn new_with_init_block(h: &Key, init_block: u128) -> Self {
-        Self {
-            h: FieldElement::from(h).powers_of_h(),
-            y: init_block.into(),
-        }
-    }
-}
-
-impl<const N: usize> KeyInit for Polyval<N> {
-    /// Initialize POLYVAL with the given `H` field element
-    fn new(h: &Key) -> Self {
-        Self::new_with_init_block(h, 0)
-    }
-}
-
-impl<const N: usize> BlockSizeUser for Polyval<N> {
-    type BlockSize = U16;
+/// # Safety
+/// It is the caller's responsibility to ensure the host CPU is capable of PMULL and NEON
+/// instructions.
+// TODO(tarcieri): investigate ordering optimizations and fusions e.g.`fuse-crypto-eor`
+#[inline]
+#[target_feature(enable = "aes,neon")]
+pub(super) unsafe fn polymul(y: Simd128, h: Simd128) -> Simd128 {
+    let (h, m, l) = karatsuba1(h, y);
+    let (h, l) = karatsuba2(h, m, l);
+    mont_reduce(h, l)
 }
 
-impl<const N: usize> ParBlocksSizeUser for Polyval<N>
-where
-    U<N>: ArraySize,
-    Const<N>: ToUInt,
-{
-    type ParBlocksSize = U<N>;
+/// Process an individual block.
+///
+/// # Safety
+/// It is the caller's responsibility to ensure the host CPU is capable of PMULL and NEON
+/// instructions.
+#[inline]
+#[target_feature(enable = "aes,neon")]
+pub(super) unsafe fn proc_block(h: FieldElement, y: FieldElement, x: &Block) -> FieldElement {
+    let y = veorq_u8(y.into(), vld1q_u8(x.as_ptr()));
+    polymul(y, h.into()).into()
 }
 
-impl<const N: usize> UhfBackend for Polyval<N>
-where
-    U<N>: ArraySize,
-    Const<N>: ToUInt,
-{
-    fn proc_par_blocks(&mut self, blocks: &ParBlocks<Self>) {
-        unsafe {
-            let mut h = vdupq_n_u8(0);
-            let mut m = vdupq_n_u8(0);
-            let mut l = vdupq_n_u8(0);
-
-            // Note: Manually unrolling this loop did not help in benchmarks.
-            for i in (0..N).rev() {
-                let mut x = vld1q_u8(blocks[i].as_ptr());
-                if i == 0 {
-                    x = veorq_u8(x, self.y.into());
-                }
-                let y = self.h[i];
-                let (hh, mm, ll) = karatsuba1(x, y.into());
-                h = veorq_u8(h, hh);
-                m = veorq_u8(m, mm);
-                l = veorq_u8(l, ll);
+/// Process multiple blocks in parallel.
+///
+/// # Safety
+/// It is the caller's responsibility to ensure the host CPU is capable of PMULL and NEON
+/// instructions.
+#[target_feature(enable = "aes,neon")]
+pub(super) unsafe fn proc_par_blocks<const N: usize, U: ArraySize>(
+    powers_of_h: &[FieldElement; N],
+    y: FieldElement,
+    blocks: &Array<Block, U>,
+) -> FieldElement {
+    unsafe {
+        let mut h = vdupq_n_u8(0);
+        let mut m = vdupq_n_u8(0);
+        let mut l = vdupq_n_u8(0);
+
+        // Note: Manually unrolling this loop did not help in benchmarks.
+        for i in (0..N).rev() {
+            let mut x = vld1q_u8(blocks[i].as_ptr());
+            if i == 0 {
+                x = veorq_u8(x, y.into());
             }
-
-            let (h, l) = karatsuba2(h, m, l);
-            self.y = mont_reduce(h, l).into();
+            let (hh, mm, ll) = karatsuba1(x, powers_of_h[i].into());
+            h = veorq_u8(h, hh);
+            m = veorq_u8(m, mm);
+            l = veorq_u8(l, ll);
         }
-    }
-
-    fn proc_block(&mut self, x: &Block) {
-        unsafe {
-            let y = veorq_u8(self.y.into(), vld1q_u8(x.as_ptr()));
-            self.y = polymul(y, self.h[N - 1].into()).into();
-        }
-    }
-}
 
-impl<const N: usize> Reset for Polyval<N> {
-    fn reset(&mut self) {
-        self.y = FieldElement::default();
+        let (h, l) = karatsuba2(h, m, l);
+        mont_reduce(h, l).into()
     }
 }
 
-impl<const N: usize> Polyval<N> {
-    /// Get POLYVAL output.
-    pub(crate) fn finalize(self) -> Tag {
-        self.y.into()
-    }
-}
-
-impl From<FieldElement> for uint8x16_t {
+impl From<FieldElement> for Simd128 {
     #[inline]
-    fn from(fe: FieldElement) -> uint8x16_t {
+    fn from(fe: FieldElement) -> Simd128 {
         unsafe { vld1q_u8(fe.0.as_ptr()) }
     }
 }
 
-impl From<uint8x16_t> for FieldElement {
+impl From<Simd128> for FieldElement {
     #[inline]
-    fn from(fe: uint8x16_t) -> FieldElement {
+    fn from(fe: Simd128) -> FieldElement {
         let mut ret = FieldElement::default();
         unsafe { vst1q_u8(ret.0.as_mut_ptr(), fe) }
         ret
     }
 }
 
-/// Multiply "y" by "h" and return the result.
-// TODO(tarcieri): investigate ordering optimizations and fusions e.g.`fuse-crypto-eor`
-#[inline]
-#[target_feature(enable = "neon")]
-unsafe fn polymul(y: uint8x16_t, h: uint8x16_t) -> uint8x16_t {
-    let (h, m, l) = karatsuba1(h, y);
-    let (h, l) = karatsuba2(h, m, l);
-    mont_reduce(h, l)
-}
-
 /// Karatsuba decomposition for `x*y`.
 #[inline]
-#[target_feature(enable = "neon")]
-unsafe fn karatsuba1(x: uint8x16_t, y: uint8x16_t) -> (uint8x16_t, uint8x16_t, uint8x16_t) {
+#[target_feature(enable = "aes,neon")]
+unsafe fn karatsuba1(x: Simd128, y: Simd128) -> (Simd128, Simd128, Simd128) {
     // First Karatsuba step: decompose x and y.
     //
     // (x1*y0 + x0*y1) = (x1+x0) * (y1+y0) + (x1*y1) + (x0*y0)
@@ -179,7 +121,7 @@ unsafe fn karatsuba1(x: uint8x16_t, y: uint8x16_t) -> (uint8x16_t, uint8x16_t, u
 /// Karatsuba combine.
 #[inline]
 #[target_feature(enable = "neon")]
-unsafe fn karatsuba2(h: uint8x16_t, m: uint8x16_t, l: uint8x16_t) -> (uint8x16_t, uint8x16_t) {
+unsafe fn karatsuba2(h: Simd128, m: Simd128, l: Simd128) -> (Simd128, Simd128) {
     // Second Karatsuba step: combine into a 2n-bit product.
     //
     // m0 ^= l0 ^ h0 // = m0^(l0^h0)
@@ -218,9 +160,14 @@ unsafe fn karatsuba2(h: uint8x16_t, m: uint8x16_t, l: uint8x16_t) -> (uint8x16_t
     (x23, x01)
 }
 
+/// POLYVAL reduction polynomial (`x^128 + x^127 + x^126 + x^121 + 1`) encoded in little-endian
+/// GF(2)[x] form with reflected reduction terms arising from folding the upper 128-bits of the
+/// product into the lower half during modular reduction.
+const POLY: u128 = (1 << 127) | (1 << 126) | (1 << 121) | (1 << 63) | (1 << 62) | (1 << 57);
+
 #[inline]
-#[target_feature(enable = "neon")]
-unsafe fn mont_reduce(x23: uint8x16_t, x01: uint8x16_t) -> uint8x16_t {
+#[target_feature(enable = "aes,neon")]
+unsafe fn mont_reduce(x23: Simd128, x01: Simd128) -> Simd128 {
     // Perform the Montgomery reduction over the 256-bit X.
     //    [A1:A0] = X0 • poly
     //    [B1:B0] = [X0 ⊕ A1 : X1 ⊕ A0]
@@ -236,8 +183,8 @@ unsafe fn mont_reduce(x23: uint8x16_t, x01: uint8x16_t) -> uint8x16_t {
 
 /// Multiplies the low bits in `a` and `b`.
 #[inline]
-#[target_feature(enable = "neon")]
-unsafe fn pmull(a: uint8x16_t, b: uint8x16_t) -> uint8x16_t {
+#[target_feature(enable = "aes,neon")]
+unsafe fn pmull(a: Simd128, b: Simd128) -> Simd128 {
     mem::transmute(vmull_p64(
         vgetq_lane_u64(vreinterpretq_u64_u8(a), 0),
         vgetq_lane_u64(vreinterpretq_u64_u8(b), 0),
@@ -246,19 +193,10 @@ unsafe fn pmull(a: uint8x16_t, b: uint8x16_t) -> uint8x16_t {
 
 /// Multiplies the high bits in `a` and `b`.
 #[inline]
-#[target_feature(enable = "neon")]
-unsafe fn pmull2(a: uint8x16_t, b: uint8x16_t) -> uint8x16_t {
+#[target_feature(enable = "aes,neon")]
+unsafe fn pmull2(a: Simd128, b: Simd128) -> Simd128 {
     mem::transmute(vmull_p64(
         vgetq_lane_u64(vreinterpretq_u64_u8(a), 1),
         vgetq_lane_u64(vreinterpretq_u64_u8(b), 1),
     ))
 }
-// TODO(tarcieri): zeroize support
-// #[cfg(feature = "zeroize")]
-// impl Drop for Polyval<N> {
-//     fn drop(&mut self) {
-//         use zeroize::Zeroize;
-//         self.h.zeroize();
-//         self.y.zeroize();
-//     }
-// }