diff --git a/polyval/benches/polyval.rs b/polyval/benches/polyval.rs
index 6d613e4..bea9fc9 100644
--- a/polyval/benches/polyval.rs
+++ b/polyval/benches/polyval.rs
@@ -4,10 +4,7 @@
 
 extern crate test;
 
-use polyval::{
-    Polyval,
-    universal_hash::{KeyInit, UniversalHash},
-};
+use polyval::{Polyval, universal_hash::UniversalHash};
 use test::Bencher;
 
 // TODO(tarcieri): move this into the `universal-hash` crate
diff --git a/polyval/src/field_element.rs b/polyval/src/field_element.rs
index b7ce940..a42913f 100644
--- a/polyval/src/field_element.rs
+++ b/polyval/src/field_element.rs
@@ -3,31 +3,15 @@
 mod soft;
 
 use crate::{BLOCK_SIZE, Block};
-use core::fmt;
-use core::fmt::Debug;
-use core::ops::{Add, Mul, MulAssign};
+use core::{
+    fmt::{self, Debug},
+    ops::{Add, Mul, MulAssign},
+};
 use cpubits::cfg_if;
 
 #[cfg(feature = "zeroize")]
 use zeroize::Zeroize;
 
-cfg_if! {
-    if #[cfg(all(target_arch = "aarch64", not(polyval_backend = "soft")))] {
-        mod autodetect;
-        mod armv8;
-        pub use autodetect::Polyval as PolyvalGeneric;
-    } else if #[cfg(all(
-        any(target_arch = "x86_64", target_arch = "x86"),
-        not(polyval_backend = "soft")
-    ))] {
-        mod autodetect;
-        mod x86;
-        pub use autodetect::Polyval as PolyvalGeneric;
-    } else {
-        pub use soft::Polyval as PolyvalGeneric;
-    }
-}
-
 /// An element in POLYVAL's field.
 ///
 /// This type represents an element of the binary field GF(2^128) modulo the irreducible polynomial
@@ -48,11 +32,62 @@ cfg_if! {
 #[repr(C, align(16))] // Make ABI and alignment compatible with SIMD registers
 pub(crate) struct FieldElement([u8; BLOCK_SIZE]);
 
+cfg_if! {
+    if #[cfg(all(target_arch = "aarch64", not(polyval_backend = "soft")))] {
+        // aarch64
+        mod autodetect;
+        mod armv8;
+        pub(crate) use autodetect::{InitToken, detect_intrinsics};
+    } else if #[cfg(all(
+        any(target_arch = "x86_64", target_arch = "x86"),
+        not(polyval_backend = "soft")
+    ))] {
+        // x86/x86-64
+        mod autodetect;
+        mod x86;
+        pub(crate) use autodetect::{InitToken, detect_intrinsics};
+    } else {
+        // Pure Rust fallback implementation for other targets
+        use universal_hash::array::{Array, ArraySize};
+
+        pub(crate) type InitToken = ();
+        pub(crate) fn detect_intrinsics() -> (InitToken, bool) {
+            ((), false)
+        }
+
+        impl FieldElement {
+            /// Default degree of parallelism, i.e. how many powers of `H` to compute.
+            pub const DEFAULT_PARALLELISM: usize = 8;
+
+            /// Process an individual block.
+            pub(crate) fn proc_block(
+                h: FieldElement,
+                y: FieldElement,
+                x: &Block,
+                _has_intrinsics: InitToken
+            ) -> FieldElement {
+                soft::proc_block(h, y, x)
+            }
+
+            /// Process multiple blocks in parallel.
+            // TODO(tarcieri): currently just calls `proc_block` for each block on `soft`-only
+            pub(crate) fn proc_par_blocks<const N: usize, U: ArraySize>(
+                powers_of_h: &[FieldElement; N],
+                y: FieldElement,
+                blocks: &Array<Block, U>,
+                _has_intrinsics: InitToken
+            ) -> FieldElement {
+                soft::proc_par_blocks(powers_of_h, y, blocks)
+            }
+        }
+    }
+}
+
 impl FieldElement {
     /// Compute the first N powers of h, in reverse order.
     #[inline]
     #[allow(dead_code)] // We may not use this in some configurations
-    fn powers_of_h<const N: usize>(self) -> [Self; N] {
+    pub(crate) fn powers_of_h<const N: usize>(self) -> [Self; N] {
         // TODO: improve pipelining by using more square operations?
         let mut pow = [Self::default(); N];
         let mut prev = self;
@@ -144,8 +179,7 @@ impl Mul for FieldElement {
 
     #[inline]
     fn mul(self, rhs: Self) -> Self {
-        let v = soft::karatsuba(self, rhs);
-        soft::mont_reduce(v)
+        soft::polymul(self, rhs)
     }
 }
 
diff --git a/polyval/src/field_element/armv8.rs b/polyval/src/field_element/armv8.rs
index 95f4640..658d7cd 100644
--- a/polyval/src/field_element/armv8.rs
+++ b/polyval/src/field_element/armv8.rs
@@ -13,154 +13,96 @@
 //! For more information about PMULL, see:
 //! - <https://developer.arm.com/documentation/100069/0608/A64-SIMD-Vector-Instructions/PMULL--PMULL2--vector->
 //! - <https://eprint.iacr.org/2015/688.pdf>
+
 #![allow(unsafe_op_in_unsafe_fn)]
 
 use super::FieldElement;
-use crate::{Block, Key, Tag};
+use crate::Block;
 use core::{arch::aarch64::*, mem};
-use universal_hash::{
-    KeyInit, ParBlocks, Reset, UhfBackend,
-    array::ArraySize,
-    common::{BlockSizeUser, KeySizeUser, ParBlocksSizeUser},
-    consts::U16,
-    typenum::{Const, ToUInt, U},
-};
+use universal_hash::array::{Array, ArraySize};
 
-/// POLYVAL reduction polynomial (`x^128 + x^127 + x^126 + x^121 + 1`) encoded in little-endian
-/// GF(2)[x] form with reflected reduction terms arising from folding the upper 128-bits of the
-/// product into the lower half during modular reduction.
-const POLY: u128 = (1 << 127) | (1 << 126) | (1 << 121) | (1 << 63) | (1 << 62) | (1 << 57);
+/// 128-bit SIMD register type.
+pub(super) type Simd128 = uint8x16_t;
 
-/// **POLYVAL**: GHASH-like universal hash over GF(2^128).
-///
-/// Parameterized on a constant that determines how many
-/// blocks to process at once: higher numbers use more memory,
-/// and require more time to re-key, but process data significantly
-/// faster.
+/// Perform carryless multiplication of `y` by `h` and return the result.
 ///
-/// (This constant is not used when acceleration is not enabled.)
-#[derive(Clone)]
-pub struct Polyval<const N: usize = 8> {
-    /// Powers of H in descending order.
-    ///
-    /// (H^N, H^(N-1)...H)
-    h: [FieldElement; N],
-    y: FieldElement,
-}
-
-impl<const N: usize> KeySizeUser for Polyval<N> {
-    type KeySize = U16;
-}
-
-impl<const N: usize> Polyval<N> {
-    /// Initialize POLYVAL with the given `H` field element and initial block
-    pub fn new_with_init_block(h: &Key, init_block: u128) -> Self {
-        Self {
-            h: FieldElement::from(h).powers_of_h(),
-            y: init_block.into(),
-        }
-    }
-}
-
-impl<const N: usize> KeyInit for Polyval<N> {
-    /// Initialize POLYVAL with the given `H` field element
-    fn new(h: &Key) -> Self {
-        Self::new_with_init_block(h, 0)
-    }
-}
-
-impl<const N: usize> BlockSizeUser for Polyval<N> {
-    type BlockSize = U16;
+/// # Safety
+/// It is the caller's responsibility to ensure the host CPU is capable of PMULL and NEON
+/// instructions.
+// TODO(tarcieri): investigate ordering optimizations and fusions e.g.`fuse-crypto-eor`
+#[inline]
+#[target_feature(enable = "aes,neon")]
+pub(super) unsafe fn polymul(y: Simd128, h: Simd128) -> Simd128 {
+    let (h, m, l) = karatsuba1(h, y);
+    let (h, l) = karatsuba2(h, m, l);
+    mont_reduce(h, l)
 }
 
-impl<const N: usize> ParBlocksSizeUser for Polyval<N>
-where
-    U<N>: ArraySize,
-    Const<N>: ToUInt,
-{
-    type ParBlocksSize = U<N>;
+/// Process an individual block.
+///
+/// # Safety
+/// It is the caller's responsibility to ensure the host CPU is capable of PMULL and NEON
+/// instructions.
+#[inline]
+#[target_feature(enable = "aes,neon")]
+pub(super) unsafe fn proc_block(h: FieldElement, y: FieldElement, x: &Block) -> FieldElement {
+    let y = veorq_u8(y.into(), vld1q_u8(x.as_ptr()));
+    polymul(y, h.into()).into()
 }
 
-impl<const N: usize> UhfBackend for Polyval<N>
-where
-    U<N>: ArraySize,
-    Const<N>: ToUInt,
-{
-    fn proc_par_blocks(&mut self, blocks: &ParBlocks<Self>) {
-        unsafe {
-            let mut h = vdupq_n_u8(0);
-            let mut m = vdupq_n_u8(0);
-            let mut l = vdupq_n_u8(0);
-
-            // Note: Manually unrolling this loop did not help in benchmarks.
-            for i in (0..N).rev() {
-                let mut x = vld1q_u8(blocks[i].as_ptr());
-                if i == 0 {
-                    x = veorq_u8(x, self.y.into());
-                }
-                let y = self.h[i];
-                let (hh, mm, ll) = karatsuba1(x, y.into());
-                h = veorq_u8(h, hh);
-                m = veorq_u8(m, mm);
-                l = veorq_u8(l, ll);
+/// Process multiple blocks in parallel.
+///
+/// # Safety
+/// It is the caller's responsibility to ensure the host CPU is capable of PMULL and NEON
+/// instructions.
+#[target_feature(enable = "aes,neon")]
+pub(super) unsafe fn proc_par_blocks<const N: usize, U: ArraySize>(
+    powers_of_h: &[FieldElement; N],
+    y: FieldElement,
+    blocks: &Array<Block, U>,
+) -> FieldElement {
+    unsafe {
+        let mut h = vdupq_n_u8(0);
+        let mut m = vdupq_n_u8(0);
+        let mut l = vdupq_n_u8(0);
+
+        // Note: Manually unrolling this loop did not help in benchmarks.
+        for i in (0..N).rev() {
+            let mut x = vld1q_u8(blocks[i].as_ptr());
+            if i == 0 {
+                x = veorq_u8(x, y.into());
             }
-
-            let (h, l) = karatsuba2(h, m, l);
-            self.y = mont_reduce(h, l).into();
+            let (hh, mm, ll) = karatsuba1(x, powers_of_h[i].into());
+            h = veorq_u8(h, hh);
+            m = veorq_u8(m, mm);
+            l = veorq_u8(l, ll);
         }
-    }
-
-    fn proc_block(&mut self, x: &Block) {
-        unsafe {
-            let y = veorq_u8(self.y.into(), vld1q_u8(x.as_ptr()));
-            self.y = polymul(y, self.h[N - 1].into()).into();
-        }
-    }
-}
 
-impl<const N: usize> Reset for Polyval<N> {
-    fn reset(&mut self) {
-        self.y = FieldElement::default();
+        let (h, l) = karatsuba2(h, m, l);
+        mont_reduce(h, l).into()
     }
 }
 
-impl<const N: usize> Polyval<N> {
-    /// Get POLYVAL output.
-    pub(crate) fn finalize(self) -> Tag {
-        self.y.into()
-    }
-}
-
-impl From<FieldElement> for uint8x16_t {
+impl From<FieldElement> for Simd128 {
     #[inline]
-    fn from(fe: FieldElement) -> uint8x16_t {
+    fn from(fe: FieldElement) -> Simd128 {
         unsafe { vld1q_u8(fe.0.as_ptr()) }
     }
 }
 
-impl From<uint8x16_t> for FieldElement {
+impl From<Simd128> for FieldElement {
     #[inline]
-    fn from(fe: uint8x16_t) -> FieldElement {
+    fn from(fe: Simd128) -> FieldElement {
         let mut ret = FieldElement::default();
         unsafe { vst1q_u8(ret.0.as_mut_ptr(), fe) }
         ret
     }
 }
 
-/// Multiply "y" by "h" and return the result.
-// TODO(tarcieri): investigate ordering optimizations and fusions e.g.`fuse-crypto-eor`
-#[inline]
-#[target_feature(enable = "neon")]
-unsafe fn polymul(y: uint8x16_t, h: uint8x16_t) -> uint8x16_t {
-    let (h, m, l) = karatsuba1(h, y);
-    let (h, l) = karatsuba2(h, m, l);
-    mont_reduce(h, l)
-}
-
 /// Karatsuba decomposition for `x*y`.
 #[inline]
-#[target_feature(enable = "neon")]
-unsafe fn karatsuba1(x: uint8x16_t, y: uint8x16_t) -> (uint8x16_t, uint8x16_t, uint8x16_t) {
+#[target_feature(enable = "aes,neon")]
+unsafe fn karatsuba1(x: Simd128, y: Simd128) -> (Simd128, Simd128, Simd128) {
     // First Karatsuba step: decompose x and y.
     //
     // (x1*y0 + x0*y1) = (x1+x0) * (y1+y0) + (x1*y1) + (x0*y0)
@@ -179,7 +121,7 @@ unsafe fn karatsuba1(x: uint8x16_t, y: uint8x16_t) -> (uint8x16_t, uint8x16_t, u
 /// Karatsuba combine.
 #[inline]
 #[target_feature(enable = "neon")]
-unsafe fn karatsuba2(h: uint8x16_t, m: uint8x16_t, l: uint8x16_t) -> (uint8x16_t, uint8x16_t) {
+unsafe fn karatsuba2(h: Simd128, m: Simd128, l: Simd128) -> (Simd128, Simd128) {
     // Second Karatsuba step: combine into a 2n-bit product.
     //
     // m0 ^= l0 ^ h0 // = m0^(l0^h0)
@@ -218,9 +160,14 @@ unsafe fn karatsuba2(h: uint8x16_t, m: uint8x16_t, l: uint8x16_t) -> (uint8x16_t
     (x23, x01)
 }
 
+/// POLYVAL reduction polynomial (`x^128 + x^127 + x^126 + x^121 + 1`) encoded in little-endian
+/// GF(2)[x] form with reflected reduction terms arising from folding the upper 128-bits of the
+/// product into the lower half during modular reduction.
+const POLY: u128 = (1 << 127) | (1 << 126) | (1 << 121) | (1 << 63) | (1 << 62) | (1 << 57);
+
 #[inline]
-#[target_feature(enable = "neon")]
-unsafe fn mont_reduce(x23: uint8x16_t, x01: uint8x16_t) -> uint8x16_t {
+#[target_feature(enable = "aes,neon")]
+unsafe fn mont_reduce(x23: Simd128, x01: Simd128) -> Simd128 {
     // Perform the Montgomery reduction over the 256-bit X.
     //    [A1:A0] = X0 • poly
     //    [B1:B0] = [X0 ⊕ A1 : X1 ⊕ A0]
@@ -236,8 +183,8 @@ unsafe fn mont_reduce(x23: uint8x16_t, x01: uint8x16_t) -> uint8x16_t {
 
 /// Multiplies the low bits in `a` and `b`.
 #[inline]
-#[target_feature(enable = "neon")]
-unsafe fn pmull(a: uint8x16_t, b: uint8x16_t) -> uint8x16_t {
+#[target_feature(enable = "aes,neon")]
+unsafe fn pmull(a: Simd128, b: Simd128) -> Simd128 {
     mem::transmute(vmull_p64(
         vgetq_lane_u64(vreinterpretq_u64_u8(a), 0),
         vgetq_lane_u64(vreinterpretq_u64_u8(b), 0),
@@ -246,19 +193,10 @@ unsafe fn pmull(a: uint8x16_t, b: uint8x16_t) -> uint8x16_t {
 
 /// Multiplies the high bits in `a` and `b`.
 #[inline]
-#[target_feature(enable = "neon")]
-unsafe fn pmull2(a: uint8x16_t, b: uint8x16_t) -> uint8x16_t {
+#[target_feature(enable = "aes,neon")]
+unsafe fn pmull2(a: Simd128, b: Simd128) -> Simd128 {
     mem::transmute(vmull_p64(
         vgetq_lane_u64(vreinterpretq_u64_u8(a), 1),
         vgetq_lane_u64(vreinterpretq_u64_u8(b), 1),
     ))
 }
-// TODO(tarcieri): zeroize support
-// #[cfg(feature = "zeroize")]
-// impl Drop for Polyval<N> {
-//     fn drop(&mut self) {
-//         use zeroize::Zeroize;
-//         self.h.zeroize();
-//         self.y.zeroize();
-//     }
-// }
diff --git a/polyval/src/field_element/autodetect.rs b/polyval/src/field_element/autodetect.rs
index 6df8deb..e6721f0 100644
--- a/polyval/src/field_element/autodetect.rs
+++ b/polyval/src/field_element/autodetect.rs
@@ -1,135 +1,54 @@
 //! Autodetection for CPU intrinsics, with fallback to the "soft" backend when
 //! they are unavailable.
 
-use crate::{Key, Tag, field_element::soft};
-use core::mem::ManuallyDrop;
-use universal_hash::{
-    KeyInit, Reset, UhfClosure, UniversalHash,
-    array::ArraySize,
-    common::{BlockSizeUser, KeySizeUser},
-    consts::U16,
-    typenum::{Const, ToUInt, U},
-};
-
 #[cfg(target_arch = "aarch64")]
 use super::armv8 as intrinsics;
-
 #[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
 use super::x86 as intrinsics;
 
+use super::{FieldElement, soft};
+use crate::Block;
+use universal_hash::array::{Array, ArraySize};
+
 #[cfg(target_arch = "aarch64")]
 cpufeatures::new!(mul_intrinsics, "aes"); // `aes` implies PMULL
-
 #[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
 cpufeatures::new!(mul_intrinsics, "pclmulqdq");
 
-/// **POLYVAL**: GHASH-like universal hash over GF(2^128).
-///
-/// Parameterized on a constant that determines how many
-/// blocks to process at once: higher numbers use more memory,
-/// and require more time to re-key, but process data significantly
-/// faster.
-///
-/// (This constant is not used when acceleration is not enabled.)
-pub struct Polyval<const N: usize = 8> {
-    inner: Inner<N>,
-    token: mul_intrinsics::InitToken,
-}
-
-union Inner<const N: usize> {
-    intrinsics: ManuallyDrop<intrinsics::Polyval<N>>,
-    soft: ManuallyDrop<soft::Polyval<N>>,
-}
-
-impl<const N: usize> KeySizeUser for Polyval<N> {
-    type KeySize = U16;
-}
-
-impl<const N: usize> Polyval<N> {
-    /// Initialize POLYVAL with the given `H` field element and initial block
-    #[must_use]
-    pub fn new_with_init_block(h: &Key, init_block: u128) -> Self {
-        let (token, has_intrinsics) = mul_intrinsics::init_get();
-
-        let inner = if has_intrinsics {
-            Inner {
-                intrinsics: ManuallyDrop::new(intrinsics::Polyval::new_with_init_block(
-                    h, init_block,
-                )),
-            }
-        } else {
-            Inner {
-                soft: ManuallyDrop::new(soft::Polyval::new_with_init_block(h, init_block)),
-            }
-        };
-
-        Self { inner, token }
-    }
-}
-
-impl<const N: usize> KeyInit for Polyval<N> {
-    /// Initialize POLYVAL with the given `H` field element
-    fn new(h: &Key) -> Self {
-        Self::new_with_init_block(h, 0)
-    }
-}
-
-impl<const N: usize> BlockSizeUser for Polyval<N> {
-    type BlockSize = U16;
-}
-
-impl<const N: usize> UniversalHash for Polyval<N>
-where
-    U<N>: ArraySize,
-    Const<N>: ToUInt,
-{
-    fn update_with_backend(&mut self, f: impl UhfClosure<BlockSize = Self::BlockSize>) {
-        unsafe {
-            if self.token.get() {
-                f.call(&mut *self.inner.intrinsics);
-            } else {
-                f.call(&mut *self.inner.soft);
-            }
-        }
-    }
-
-    /// Get POLYVAL result (i.e. computed `S` field element)
-    fn finalize(self) -> Tag {
-        unsafe {
-            if self.token.get() {
-                ManuallyDrop::into_inner(self.inner.intrinsics).finalize()
-            } else {
-                ManuallyDrop::into_inner(self.inner.soft).finalize()
-            }
-        }
-    }
-}
-
-impl<const N: usize> Clone for Polyval<N> {
-    fn clone(&self) -> Self {
-        let inner = if self.token.get() {
-            Inner {
-                intrinsics: ManuallyDrop::new(unsafe { (*self.inner.intrinsics).clone() }),
-            }
+pub(crate) use mul_intrinsics::{InitToken, init_get as detect_intrinsics};
+
+impl FieldElement {
+    /// Default degree of parallelism, i.e. how many powers of `H` to compute.
+    pub const DEFAULT_PARALLELISM: usize = 8;
+
+    /// Process an individual block.
+    pub(crate) fn proc_block(
+        h: FieldElement,
+        y: FieldElement,
+        block: &Block,
+        has_intrinsics: InitToken,
+    ) -> FieldElement {
+        if has_intrinsics.get() {
+            // SAFETY: we have checked the CPU has the necessary intrinsics above
+            unsafe { intrinsics::proc_block(h, y, block) }
         } else {
-            Inner {
-                soft: ManuallyDrop::new(unsafe { (*self.inner.soft).clone() }),
-            }
-        };
-
-        Self {
-            inner,
-            token: self.token,
+            soft::proc_block(h, y, block)
         }
     }
-}
 
-impl<const N: usize> Reset for Polyval<N> {
-    fn reset(&mut self) {
-        if self.token.get() {
-            unsafe { (*self.inner.intrinsics).reset() }
+    /// Process multiple blocks in parallel.
+    pub(crate) fn proc_par_blocks<const N: usize, U: ArraySize>(
+        powers_of_h: &[FieldElement; N],
+        y: FieldElement,
+        blocks: &Array<Block, U>,
+        has_intrinsics: InitToken,
+    ) -> FieldElement {
+        if has_intrinsics.get() {
+            // SAFETY: we have checked the CPU has the necessary intrinsics above
+            unsafe { intrinsics::proc_par_blocks(powers_of_h, y, blocks) }
         } else {
-            unsafe { (*self.inner.soft).reset() }
+            // TODO(tarcieri): currently just calls `proc_block` for each block on `soft`-only
+            soft::proc_par_blocks(powers_of_h, y, blocks)
         }
     }
 }
diff --git a/polyval/src/field_element/soft.rs b/polyval/src/field_element/soft.rs
index 7daa031..ec76d3c 100644
--- a/polyval/src/field_element/soft.rs
+++ b/polyval/src/field_element/soft.rs
@@ -28,99 +28,40 @@ cpubits::cpubits! {
     }
 }
 
-pub(super) use soft_impl::{karatsuba, mont_reduce};
-
-use super::FieldElement;
-use crate::{Block, Key, Tag};
+use crate::Block;
+use crate::field_element::FieldElement;
 use core::{
     num::Wrapping,
     ops::{BitAnd, BitOr, BitXor, Mul, Shl},
 };
-use universal_hash::{
-    KeyInit, Reset, UhfBackend, UhfClosure, UniversalHash,
-    common::{BlockSizeUser, KeySizeUser, ParBlocksSizeUser},
-    consts::{U1, U16},
-};
-
-#[cfg(feature = "zeroize")]
-use zeroize::Zeroize;
-
-/// **POLYVAL**: GHASH-like universal hash over GF(2^128).
-///
-/// Parameterized on a constant that determines how many
-/// blocks to process at once: higher numbers use more memory,
-/// and require more time to re-key, but process data significantly
-/// faster.
-///
-/// (This constant is not used when acceleration is not enabled.)
-#[derive(Clone)]
-pub struct Polyval<const N: usize = 1> {
-    /// GF(2^128) field element input blocks are multiplied by
-    h: FieldElement,
-
-    /// Field element representing the computed universal hash
-    y: FieldElement,
-}
-
-impl<const N: usize> Polyval<N> {
-    /// Initialize POLYVAL with the given `H` field element and initial block
-    pub fn new_with_init_block(h: &Key, init_block: u128) -> Self {
-        Self {
-            h: FieldElement::from(*h),
-            y: init_block.into(),
-        }
-    }
-}
-
-impl<const N: usize> KeySizeUser for Polyval<N> {
-    type KeySize = U16;
-}
-
-impl<const N: usize> KeyInit for Polyval<N> {
-    /// Initialize POLYVAL with the given `H` field element
-    fn new(h: &Key) -> Self {
-        Self::new_with_init_block(h, 0)
-    }
-}
-
-impl<const N: usize> BlockSizeUser for Polyval<N> {
-    type BlockSize = U16;
-}
-
-impl<const N: usize> ParBlocksSizeUser for Polyval<N> {
-    type ParBlocksSize = U1;
-}
-
-impl<const N: usize> UhfBackend for Polyval<N> {
-    fn proc_block(&mut self, x: &Block) {
-        let x = FieldElement::from(x);
-        self.y = (self.y + x) * self.h;
-    }
-}
-
-impl<const N: usize> UniversalHash for Polyval<N> {
-    fn update_with_backend(&mut self, f: impl UhfClosure<BlockSize = Self::BlockSize>) {
-        f.call(self);
-    }
+use soft_impl::{karatsuba, mont_reduce};
+use universal_hash::array::{Array, ArraySize};
 
-    /// Get POLYVAL result (i.e. computed `S` field element)
-    fn finalize(self) -> Tag {
-        self.y.into()
-    }
+/// Perform carryless multiplication of `y` by `h` and return the result.
+#[inline]
+pub(super) fn polymul(y: FieldElement, h: FieldElement) -> FieldElement {
+    let v = karatsuba(y, h);
+    mont_reduce(v)
 }
 
-impl<const N: usize> Reset for Polyval<N> {
-    fn reset(&mut self) {
-        self.y = FieldElement::default();
-    }
+/// Process an individual block.
+// TODO(tarcieri): implement `proc_par_blocks` for soft backend?
+pub(super) fn proc_block(h: FieldElement, y: FieldElement, x: &Block) -> FieldElement {
+    let x = FieldElement::from(x);
+    polymul(y + x, h)
 }
 
-#[cfg(feature = "zeroize")]
-impl<const N: usize> Drop for Polyval<N> {
-    fn drop(&mut self) {
-        self.h.zeroize();
-        self.y.zeroize();
+/// Process multiple blocks.
+// TODO(tarcieri): optimized implementation?
+pub(super) fn proc_par_blocks<const N: usize, U: ArraySize>(
+    powers_of_h: &[FieldElement; N],
+    mut y: FieldElement,
+    blocks: &Array<Block, U>,
+) -> FieldElement {
+    for block in blocks.iter() {
+        y = proc_block(powers_of_h[N - 1], y, block);
     }
+    y
 }
 
 /// Multiplication in GF(2)[X], implemented generically and wrapped as `bmul32` and `bmul64`.
diff --git a/polyval/src/field_element/soft/soft32.rs b/polyval/src/field_element/soft/soft32.rs
index 5b172ce..073653f 100644
--- a/polyval/src/field_element/soft/soft32.rs
+++ b/polyval/src/field_element/soft/soft32.rs
@@ -56,7 +56,7 @@ impl FieldElement {
 /// multiplications, hence nine 32x32 multiplications. With the bit-reversal trick, we have to
 /// perform 18 32x32 multiplications.
 #[inline]
-pub(crate) fn karatsuba(h: FieldElement, y: FieldElement) -> [u32; 8] {
+pub(super) fn karatsuba(h: FieldElement, y: FieldElement) -> [u32; 8] {
     let hw = h.to_u32x4();
     let yw = y.to_u32x4();
     let hwr = [
@@ -149,7 +149,7 @@ fn bmul32(x: u32, y: u32) -> u32 {
 ///
 /// This is closely related to GHASH reduction but the bit order is reversed in POLYVAL.
 #[inline]
-pub(crate) fn mont_reduce(mut zw: [u32; 8]) -> FieldElement {
+pub(super) fn mont_reduce(mut zw: [u32; 8]) -> FieldElement {
     for i in 0..4 {
         let lw = zw[i];
         zw[i + 4] ^= lw ^ (lw >> 1) ^ (lw >> 2) ^ (lw >> 7);
diff --git a/polyval/src/field_element/x86.rs b/polyval/src/field_element/x86.rs
index 41008df..c20a727 100644
--- a/polyval/src/field_element/x86.rs
+++ b/polyval/src/field_element/x86.rs
@@ -1,8 +1,10 @@
 //! Intel `CLMUL`-accelerated implementation for modern x86/x86_64 CPUs
 //! (i.e. Intel Sandy Bridge-compatible or newer)
 //!
-//! Based on implementation by Eric Lagergren
-//! at <https://github.com/ericlagergren/polyval-rs/>.
+//! Based on implementation by Eric Lagergren at
+//! <https://github.com/ericlagergren/polyval-rs/>.
+
+#![allow(unsafe_op_in_unsafe_fn, unused_unsafe)]
 
 #[cfg(target_arch = "x86")]
 use core::arch::x86::*;
@@ -10,183 +12,85 @@ use core::arch::x86::*;
 use core::arch::x86_64::*;
 
 use super::FieldElement;
-use crate::{Block, Key, Tag};
+use crate::Block;
 use core::ptr;
-use universal_hash::{
-    KeyInit, ParBlocks, Reset, UhfBackend,
-    array::ArraySize,
-    common::{BlockSizeUser, KeySizeUser, ParBlocksSizeUser},
-    consts::U16,
-    typenum::{Const, ToUInt, U},
-};
+use universal_hash::array::{Array, ArraySize};
+
+/// 128-bit SIMD register type.
+pub(super) type Simd128 = __m128i;
 
-/// **POLYVAL**: GHASH-like universal hash over GF(2^128).
+/// Perform carryless multiplication of `y` by `h` and return the result.
 ///
-/// Parameterized on a constant that determines how many
-/// blocks to process at once: higher numbers use more memory,
-/// and require more time to re-key, but process data significantly
-/// faster.
+/// # Safety
 ///
-/// (This constant is not used when acceleration is not enabled.)
-#[derive(Clone)]
-pub struct Polyval<const N: usize = 8> {
-    /// Powers of H in descending order.
-    ///
-    /// (H^N, H^(N-1)...H)
-    h: [FieldElement; N],
-    y: FieldElement,
-}
-
-impl<const N: usize> KeySizeUser for Polyval<N> {
-    type KeySize = U16;
-}
-
-impl<const N: usize> Polyval<N> {
-    /// Initialize POLYVAL with the given `H` field element and initial block
-    pub fn new_with_init_block(h: &Key, init_block: u128) -> Self {
-        Self {
-            h: FieldElement::from(h).powers_of_h(),
-            y: init_block.into(),
-        }
-    }
-}
-
-impl<const N: usize> KeyInit for Polyval<N> {
-    /// Initialize POLYVAL with the given `H` field element
-    fn new(h: &Key) -> Self {
-        Self::new_with_init_block(h, 0)
-    }
-}
-
-impl<const N: usize> BlockSizeUser for Polyval<N> {
-    type BlockSize = U16;
-}
-
-impl<const N: usize> ParBlocksSizeUser for Polyval<N>
-where
-    U<N>: ArraySize,
-    Const<N>: ToUInt,
-{
-    type ParBlocksSize = U<N>;
-}
-
-impl<const N: usize> UhfBackend for Polyval<N>
-where
-    U<N>: ArraySize,
-    Const<N>: ToUInt,
-{
-    fn proc_par_blocks(&mut self, blocks: &ParBlocks<Self>) {
-        unsafe {
-            self.mul_par_blocks(blocks);
-        }
-    }
-
-    fn proc_block(&mut self, x: &Block) {
-        unsafe {
-            self.mul(x);
-        }
+/// The SSE2 and pclmulqdq target features must be enabled.
+#[inline]
+#[target_feature(enable = "pclmulqdq,sse2")]
+pub(super) unsafe fn polymul(x: Simd128, y: Simd128) -> Simd128 {
+    let (h, m, l) = unsafe { karatsuba1(x, y) };
+    let (h, l) = unsafe { karatsuba2(h, m, l) };
+    unsafe {
+        mont_reduce(h, l) // d
     }
 }
 
-impl<const N: usize> Polyval<N> {
-    /// Get Polyval output
-    pub(crate) fn finalize(self) -> Tag {
-        unsafe { core::mem::transmute(self.y) }
-    }
+/// Perform carryless multiplication of `y` by `h` and return the result.
+#[inline]
+#[target_feature(enable = "pclmulqdq")]
+pub(super) unsafe fn proc_block(h: FieldElement, y: FieldElement, x: &Block) -> FieldElement {
+    let x = unsafe { _mm_loadu_si128(x.as_ptr().cast()) };
+    unsafe { polymul(_mm_xor_si128(y.into(), x), h.into()).into() }
 }
 
-impl<const N: usize> Polyval<N>
-where
-    U<N>: ArraySize,
-    Const<N>: ToUInt,
-{
-    #[inline]
-    #[target_feature(enable = "pclmulqdq")]
-    unsafe fn mul_par_blocks(&mut self, blocks: &ParBlocks<Self>) {
-        unsafe {
-            let mut h = _mm_setzero_si128();
-            let mut m = _mm_setzero_si128();
-            let mut l = _mm_setzero_si128();
-
-            // Note: Manually unrolling this loop did not help in benchmarks.
-            for i in (0..N).rev() {
-                let mut x = _mm_loadu_si128(blocks[i].as_ptr().cast());
-                if i == 0 {
-                    x = _mm_xor_si128(x, self.y.into());
-                }
-                let y = self.h[i];
-                let (hh, mm, ll) = karatsuba1(x, y.into());
-                h = _mm_xor_si128(h, hh);
-                m = _mm_xor_si128(m, mm);
-                l = _mm_xor_si128(l, ll);
+#[inline]
+#[target_feature(enable = "pclmulqdq")]
+pub(super) unsafe fn proc_par_blocks<const N: usize, U: ArraySize>(
+    powers_of_h: &[FieldElement; N],
+    y: FieldElement,
+    blocks: &Array<Block, U>,
+) -> FieldElement {
+    unsafe {
+        let mut h = _mm_setzero_si128();
+        let mut m = _mm_setzero_si128();
+        let mut l = _mm_setzero_si128();
+
+        // Note: Manually unrolling this loop did not help in benchmarks.
+        for i in (0..N).rev() {
+            let mut x = _mm_loadu_si128(blocks[i].as_ptr().cast());
+            if i == 0 {
+                x = _mm_xor_si128(x, y.into());
             }
-
-            let (h, l) = karatsuba2(h, m, l);
-            self.y = mont_reduce(h, l).into();
+            let (hh, mm, ll) = karatsuba1(x, powers_of_h[i].into());
+            h = _mm_xor_si128(h, hh);
+            m = _mm_xor_si128(m, mm);
+            l = _mm_xor_si128(l, ll);
         }
-    }
-
-    #[inline]
-    #[target_feature(enable = "pclmulqdq")]
-    #[allow(unsafe_op_in_unsafe_fn)]
-    unsafe fn mul(&mut self, x: &Block) {
-        let x = _mm_loadu_si128(x.as_ptr().cast());
-        self.y = polymul(_mm_xor_si128(self.y.into(), x), self.h[N - 1].into()).into();
-    }
-}
-
-impl<const N: usize> Reset for Polyval<N> {
-    fn reset(&mut self) {
-        self.y = FieldElement::default();
-    }
-}
 
-#[cfg(feature = "zeroize")]
-impl<const N: usize> Drop for Polyval<N> {
-    fn drop(&mut self) {
-        use zeroize::Zeroize;
-        self.h.zeroize();
-        self.y.zeroize();
+        let (h, l) = karatsuba2(h, m, l);
+        mont_reduce(h, l).into()
     }
 }
 
-impl From<FieldElement> for __m128i {
+impl From<FieldElement> for Simd128 {
     #[inline]
-    fn from(fe: FieldElement) -> __m128i {
+    fn from(fe: FieldElement) -> Simd128 {
         unsafe { _mm_loadu_si128(fe.0.as_ptr().cast()) }
     }
 }
 
-impl From<__m128i> for FieldElement {
+impl From<Simd128> for FieldElement {
     #[inline]
-    fn from(fe: __m128i) -> FieldElement {
+    fn from(fe: Simd128) -> FieldElement {
         let mut ret = FieldElement::default();
         unsafe { _mm_store_si128(ret.0.as_mut_ptr().cast(), fe) }
         ret
     }
 }
 
-/// # Safety
-///
-/// The SSE2 and pclmulqdq target features must be enabled.
-#[inline]
-#[target_feature(enable = "sse2,pclmulqdq")]
-#[allow(unused_unsafe)]
-#[allow(clippy::undocumented_unsafe_blocks, reason = "Too many unsafe blocks.")]
-unsafe fn polymul(x: __m128i, y: __m128i) -> __m128i {
-    let (h, m, l) = unsafe { karatsuba1(x, y) };
-    let (h, l) = unsafe { karatsuba2(h, m, l) };
-    unsafe {
-        mont_reduce(h, l) // d
-    }
-}
-
 /// Karatsuba decomposition for `x*y`.
 #[inline]
-#[target_feature(enable = "sse2,pclmulqdq")]
-#[allow(unused_unsafe)]
-#[allow(clippy::undocumented_unsafe_blocks, reason = "Too many unsafe blocks.")]
-unsafe fn karatsuba1(x: __m128i, y: __m128i) -> (__m128i, __m128i, __m128i) {
+#[target_feature(enable = "pclmulqdq,sse2")]
+unsafe fn karatsuba1(x: Simd128, y: Simd128) -> (Simd128, Simd128, Simd128) {
     // First Karatsuba step: decompose x and y.
     //
     // (x1*y0 + x0*y1) = (x1+x0) * (y1+x0) + (x1*y1) + (x0*y0)
@@ -194,22 +98,20 @@ unsafe fn karatsuba1(x: __m128i, y: __m128i) -> (__m128i, __m128i, __m128i) {
     //
     // m = x.hi^x.lo * y.hi^y.lo
     let m = unsafe {
-        pmull(
+        clmul(
             _mm_xor_si128(x, _mm_shuffle_epi32(x, 0xee)),
             _mm_xor_si128(y, _mm_shuffle_epi32(y, 0xee)),
         )
     };
-    let h = unsafe { pmull2(y, x) }; // h = x.hi * y.hi
-    let l = unsafe { pmull(y, x) }; // l = x.lo * y.lo
+    let h = unsafe { clmul2(y, x) }; // h = x.hi * y.hi
+    let l = unsafe { clmul(y, x) }; // l = x.lo * y.lo
     (h, m, l)
 }
 
 /// Karatsuba combine.
 #[inline]
-#[target_feature(enable = "sse2,pclmulqdq")]
-#[allow(unused_unsafe)]
-#[allow(clippy::undocumented_unsafe_blocks, reason = "Too many unsafe blocks.")]
-unsafe fn karatsuba2(h: __m128i, m: __m128i, l: __m128i) -> (__m128i, __m128i) {
+#[target_feature(enable = "pclmulqdq,sse2")]
+unsafe fn karatsuba2(h: Simd128, m: Simd128, l: Simd128) -> (Simd128, Simd128) {
     // Second Karatsuba step: combine into a 2n-bit product.
     //
     // m0 ^= l0 ^ h0 // = m0^(l0^h0)
@@ -242,21 +144,20 @@ unsafe fn karatsuba2(h: __m128i, m: __m128i, l: __m128i) -> (__m128i, __m128i) {
 
     // {m0^l1^h0^l0, l0}
     let x01 = unsafe { _mm_unpacklo_epi64(l, t) };
-
     // {h1, m1^h0^h1^l1}
     let x23 = unsafe { _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(h), _mm_castsi128_ps(t))) };
 
     (x23, x01)
 }
 
-/// # Safety
+/// Perform Montgomery reduction of the 256-bit product into 128-bits.
 ///
-/// The SSE2 and pclmulqdq target features must be enavled.
+/// # Safety
+/// It is the caller's responsibility to ensure the host CPU is capable of CLMUL and SSE2
+/// instructions.
 #[inline]
-#[target_feature(enable = "sse2,pclmulqdq")]
-#[allow(unused_unsafe)]
-#[allow(clippy::undocumented_unsafe_blocks, reason = "Too many unsafe blocks.")]
-unsafe fn mont_reduce(x23: __m128i, x01: __m128i) -> __m128i {
+#[target_feature(enable = "pclmulqdq,sse2")]
+unsafe fn mont_reduce(x23: Simd128, x01: Simd128) -> Simd128 {
     // Perform the Montgomery reduction over the 256-bit X.
     //    [A1:A0] = X0 • poly
     //    [B1:B0] = [X0 ⊕ A1 : X1 ⊕ A0]
@@ -265,21 +166,20 @@ unsafe fn mont_reduce(x23: __m128i, x01: __m128i) -> __m128i {
     // Output: [D1 ⊕ X3 : D0 ⊕ X2]
     static POLY: u128 = (1 << 127) | (1 << 126) | (1 << 121) | (1 << 63) | (1 << 62) | (1 << 57);
     let poly = unsafe { _mm_loadu_si128(ptr::addr_of!(POLY).cast()) };
-    let a = unsafe { pmull(x01, poly) };
+    let a = unsafe { clmul(x01, poly) };
     let b = unsafe { _mm_xor_si128(x01, _mm_shuffle_epi32(a, 0x4e)) };
-    let c = unsafe { pmull2(b, poly) };
+    let c = unsafe { clmul2(b, poly) };
     unsafe { _mm_xor_si128(x23, _mm_xor_si128(c, b)) }
 }
 
 /// Multiplies the low bits in `a` and `b`.
 ///
 /// # Safety
-///
-/// The SSE2 and pclmulqdq target features must be enabled.
+/// It is the caller's responsibility to ensure the host CPU is capable of CLMUL and SSE2
+/// instructions.
 #[inline]
-#[allow(unused_unsafe)]
-#[target_feature(enable = "sse2,pclmulqdq")]
-unsafe fn pmull(a: __m128i, b: __m128i) -> __m128i {
+#[target_feature(enable = "pclmulqdq,sse2")]
+unsafe fn clmul(a: Simd128, b: Simd128) -> Simd128 {
     // SAFETY: This requires the `sse2` and `pclmulqdq` features
     // which we have.
     unsafe { _mm_clmulepi64_si128(a, b, 0x00) }
@@ -288,12 +188,11 @@ unsafe fn pmull(a: __m128i, b: __m128i) -> __m128i {
 /// Multiplies the high bits in `a` and `b`.
 ///
 /// # Safety
-///
-/// The SSE2 and pclmulqdq target features must be enavled.
+/// It is the caller's responsibility to ensure the host CPU is capable of CLMUL and SSE2
+/// instructions.
 #[inline]
-#[allow(unused_unsafe)]
-#[target_feature(enable = "sse2,pclmulqdq")]
-unsafe fn pmull2(a: __m128i, b: __m128i) -> __m128i {
+#[target_feature(enable = "pclmulqdq,sse2")]
+unsafe fn clmul2(a: Simd128, b: Simd128) -> Simd128 {
     // SAFETY: This requires the `sse2` and `pclmulqdq` features
     // which we have.
     unsafe { _mm_clmulepi64_si128(a, b, 0x11) }
diff --git a/polyval/src/lib.rs b/polyval/src/lib.rs
index eb4ce04..2a77c23 100644
--- a/polyval/src/lib.rs
+++ b/polyval/src/lib.rs
@@ -9,14 +9,21 @@
 mod field_element;
 mod mulx;
 
-pub use crate::{field_element::PolyvalGeneric, mulx::mulx};
+pub use crate::mulx::mulx;
 pub use universal_hash;
 
-impl<const N: usize> core::fmt::Debug for PolyvalGeneric<N> {
-    fn fmt(&self, f: &mut core::fmt::Formatter) -> Result<(), core::fmt::Error> {
-        write!(f, "PolyvalGeneric<{}> {{ ... }}", N)
-    }
-}
+use core::fmt::{self, Debug};
+use field_element::{FieldElement, InitToken, detect_intrinsics};
+use universal_hash::{
+    KeyInit, ParBlocks, Reset, UhfBackend, UhfClosure, UniversalHash,
+    array::{Array, ArraySize},
+    common::{BlockSizeUser, KeySizeUser, ParBlocksSizeUser},
+    consts::U16,
+    typenum::{Const, ToUInt, U},
+};
+
+#[cfg(feature = "zeroize")]
+use zeroize::Zeroize;
 
 /// Size of a POLYVAL block in bytes
 pub const BLOCK_SIZE: usize = 16;
@@ -25,13 +32,133 @@ pub const BLOCK_SIZE: usize = 16;
 pub const KEY_SIZE: usize = 16;
 
 /// POLYVAL keys (16-bytes)
-pub type Key = universal_hash::Key<Polyval>;
+pub type Key = Array<u8, U16>;
 
 /// POLYVAL blocks (16-bytes)
-pub type Block = universal_hash::Block<Polyval>;
+pub type Block = Array<u8, U16>;
 
 /// POLYVAL tags (16-bytes)
-pub type Tag = universal_hash::Block<Polyval>;
+pub type Tag = Array<u8, U16>;
+
+/// **POLYVAL**: GHASH-like universal hash over GF(2^128).
+///
+/// This type alias uses the default amount of parallelism for the target (`8` for `aarch64`/`x86`,
+/// `1` for other targets using a pure Rust fallback implementation).
+pub type Polyval = PolyvalGeneric<{ FieldElement::DEFAULT_PARALLELISM }>;
 
 /// **POLYVAL**: GHASH-like universal hash over GF(2^128).
-pub type Polyval = PolyvalGeneric<8>;
+///
+/// Parameterized on a constant that determines how many blocks to process at once: higher numbers
+/// use more memory, and require more time to re-key, but process data significantly faster.
+///
+/// (This constant is not used when acceleration is not enabled.)
+#[derive(Clone)]
+pub struct PolyvalGeneric<const N: usize = { FieldElement::DEFAULT_PARALLELISM }> {
+    /// Powers of H in descending order.
+    ///
+    /// (H^N, H^(N-1)...H)
+    powers_of_h: [FieldElement; N],
+
+    /// Accumulator for POLYVAL computation.
+    y: FieldElement,
+
+    /// Token for accessing CPU feature detection results.
+    has_intrinsics: InitToken,
+}
+
+impl<const N: usize> PolyvalGeneric<N> {
+    /// Initialize POLYVAL with the given `H` field element.
+    #[must_use]
+    pub fn new(h: &Key) -> Self {
+        Self::new_with_init_block(h, 0)
+    }
+
+    /// Initialize POLYVAL with the given `H` field element and initial block.
+    #[must_use]
+    pub fn new_with_init_block(h: &Key, init_block: u128) -> Self {
+        let (token, _has_intrinsics) = detect_intrinsics();
+        Self {
+            powers_of_h: FieldElement::from(h).powers_of_h(),
+            y: init_block.into(),
+            has_intrinsics: token,
+        }
+    }
+
+    /// Get `h` from the powers-of-`H`.
+    #[inline]
+    pub(crate) fn h(&self) -> FieldElement {
+        self.powers_of_h[N - 1]
+    }
+}
+
+impl<const N: usize> KeyInit for PolyvalGeneric<N> {
+    fn new(h: &Key) -> Self {
+        Self::new(h)
+    }
+}
+
+impl<const N: usize> KeySizeUser for PolyvalGeneric<N> {
+    type KeySize = U16;
+}
+
+impl<const N: usize> BlockSizeUser for PolyvalGeneric<N> {
+    type BlockSize = U16;
+}
+
+impl<const N: usize> ParBlocksSizeUser for PolyvalGeneric<N>
+where
+    U<N>: ArraySize,
+    Const<N>: ToUInt,
+{
+    type ParBlocksSize = U<N>;
+}
+
+impl<const N: usize> UniversalHash for PolyvalGeneric<N>
+where
+    U<N>: ArraySize,
+    Const<N>: ToUInt,
+{
+    fn update_with_backend(&mut self, f: impl UhfClosure<BlockSize = Self::BlockSize>) {
+        f.call(self);
+    }
+
+    fn finalize(self) -> Tag {
+        self.y.into()
+    }
+}
+
+#[allow(clippy::unit_arg)]
+impl<const N: usize> UhfBackend for PolyvalGeneric<N>
+where
+    U<N>: ArraySize,
+    Const<N>: ToUInt,
+{
+    fn proc_block(&mut self, block: &Block) {
+        self.y = FieldElement::proc_block(self.h(), self.y, block, self.has_intrinsics);
+    }
+
+    fn proc_par_blocks(&mut self, blocks: &ParBlocks<Self>) {
+        self.y =
+            FieldElement::proc_par_blocks(&self.powers_of_h, self.y, blocks, self.has_intrinsics);
+    }
+}
+
+impl<const N: usize> Reset for PolyvalGeneric<N> {
+    fn reset(&mut self) {
+        self.y = FieldElement::default();
+    }
+}
+
+#[cfg(feature = "zeroize")]
+impl<const N: usize> Drop for PolyvalGeneric<N> {
+    fn drop(&mut self) {
+        self.powers_of_h.zeroize();
+        self.y.zeroize();
+    }
+}
+
+impl<const N: usize> Debug for PolyvalGeneric<N> {
+    fn fmt(&self, f: &mut fmt::Formatter) -> Result<(), fmt::Error> {
+        write!(f, "PolyvalGeneric<{}> {{ ... }}", N)
+    }
+}