Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 1 addition & 4 deletions polyval/benches/polyval.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4,10 +4,7 @@

extern crate test;

use polyval::{
Polyval,
universal_hash::{KeyInit, UniversalHash},
};
use polyval::{Polyval, universal_hash::UniversalHash};
use test::Bencher;

// TODO(tarcieri): move this into the `universal-hash` crate
Expand Down
80 changes: 57 additions & 23 deletions polyval/src/field_element.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3,31 +3,15 @@
mod soft;

use crate::{BLOCK_SIZE, Block};
use core::fmt;
use core::fmt::Debug;
use core::ops::{Add, Mul, MulAssign};
use core::{
fmt::{self, Debug},
ops::{Add, Mul, MulAssign},
};
use cpubits::cfg_if;

#[cfg(feature = "zeroize")]
use zeroize::Zeroize;

cfg_if! {
if #[cfg(all(target_arch = "aarch64", not(polyval_backend = "soft")))] {
mod autodetect;
mod armv8;
pub use autodetect::Polyval as PolyvalGeneric;
} else if #[cfg(all(
any(target_arch = "x86_64", target_arch = "x86"),
not(polyval_backend = "soft")
))] {
mod autodetect;
mod x86;
pub use autodetect::Polyval as PolyvalGeneric;
} else {
pub use soft::Polyval as PolyvalGeneric;
}
}

/// An element in POLYVAL's field.
///
/// This type represents an element of the binary field GF(2^128) modulo the irreducible polynomial
Expand All @@ -48,11 +32,62 @@ cfg_if! {
#[repr(C, align(16))] // Make ABI and alignment compatible with SIMD registers
pub(crate) struct FieldElement([u8; BLOCK_SIZE]);

cfg_if! {
if #[cfg(all(target_arch = "aarch64", not(polyval_backend = "soft")))] {
// aarch64
mod autodetect;
mod armv8;
pub(crate) use autodetect::{InitToken, detect_intrinsics};
} else if #[cfg(all(
any(target_arch = "x86_64", target_arch = "x86"),
not(polyval_backend = "soft")
))] {
// x86/x86-64
mod autodetect;
mod x86;
pub(crate) use autodetect::{InitToken, detect_intrinsics};
} else {
// Pure Rust fallback implementation for other targets
use universal_hash::array::{Array, ArraySize};

pub(crate) type InitToken = ();
pub(crate) fn detect_intrinsics() -> (InitToken, bool) {
((), false)
}

impl FieldElement {
/// Default degree of parallelism, i.e. how many powers of `H` to compute.
pub const DEFAULT_PARALLELISM: usize = 8;

/// Process an individual block.
pub(crate) fn proc_block(
h: FieldElement,
y: FieldElement,
x: &Block,
_has_intrinsics: InitToken
) -> FieldElement {
soft::proc_block(h, y, x)
}

/// Process multiple blocks in parallel.
// TODO(tarcieri): currently just calls `proc_block` for each block on `soft`-only
pub(crate) fn proc_par_blocks<const N: usize, U: ArraySize>(
powers_of_h: &[FieldElement; N],
y: FieldElement,
blocks: &Array<Block, U>,
_has_intrinsics: InitToken
) -> FieldElement {
soft::proc_par_blocks(powers_of_h, y, blocks)
}
}
}
}

impl FieldElement {
/// Compute the first N powers of h, in reverse order.
#[inline]
#[allow(dead_code)] // We may not use this in some configurations
fn powers_of_h<const N: usize>(self) -> [Self; N] {
pub(crate) fn powers_of_h<const N: usize>(self) -> [Self; N] {
// TODO: improve pipelining by using more square operations?
let mut pow = [Self::default(); N];
let mut prev = self;
Expand Down Expand Up @@ -144,8 +179,7 @@ impl Mul for FieldElement {

#[inline]
fn mul(self, rhs: Self) -> Self {
let v = soft::karatsuba(self, rhs);
soft::mont_reduce(v)
soft::polymul(self, rhs)
}
}

Expand Down
204 changes: 71 additions & 133 deletions polyval/src/field_element/armv8.rs
Original file line number Diff line number Diff line change
Expand Up @@ -13,154 +13,96 @@
//! For more information about PMULL, see:
//! - <https://developer.arm.com/documentation/100069/0608/A64-SIMD-Vector-Instructions/PMULL--PMULL2--vector->
//! - <https://eprint.iacr.org/2015/688.pdf>

#![allow(unsafe_op_in_unsafe_fn)]

use super::FieldElement;
use crate::{Block, Key, Tag};
use crate::Block;
use core::{arch::aarch64::*, mem};
use universal_hash::{
KeyInit, ParBlocks, Reset, UhfBackend,
array::ArraySize,
common::{BlockSizeUser, KeySizeUser, ParBlocksSizeUser},
consts::U16,
typenum::{Const, ToUInt, U},
};
use universal_hash::array::{Array, ArraySize};

/// POLYVAL reduction polynomial (`x^128 + x^127 + x^126 + x^121 + 1`) encoded in little-endian
/// GF(2)[x] form with reflected reduction terms arising from folding the upper 128-bits of the
/// product into the lower half during modular reduction.
const POLY: u128 = (1 << 127) | (1 << 126) | (1 << 121) | (1 << 63) | (1 << 62) | (1 << 57);
/// 128-bit SIMD register type.
pub(super) type Simd128 = uint8x16_t;

/// **POLYVAL**: GHASH-like universal hash over GF(2^128).
///
/// Parameterized on a constant that determines how many
/// blocks to process at once: higher numbers use more memory,
/// and require more time to re-key, but process data significantly
/// faster.
/// Perform carryless multiplication of `y` by `h` and return the result.
///
/// (This constant is not used when acceleration is not enabled.)
#[derive(Clone)]
pub struct Polyval<const N: usize = 8> {
/// Powers of H in descending order.
///
/// (H^N, H^(N-1)...H)
h: [FieldElement; N],
y: FieldElement,
}

impl<const N: usize> KeySizeUser for Polyval<N> {
type KeySize = U16;
}

impl<const N: usize> Polyval<N> {
/// Initialize POLYVAL with the given `H` field element and initial block
pub fn new_with_init_block(h: &Key, init_block: u128) -> Self {
Self {
h: FieldElement::from(h).powers_of_h(),
y: init_block.into(),
}
}
}

impl<const N: usize> KeyInit for Polyval<N> {
/// Initialize POLYVAL with the given `H` field element
fn new(h: &Key) -> Self {
Self::new_with_init_block(h, 0)
}
}

impl<const N: usize> BlockSizeUser for Polyval<N> {
type BlockSize = U16;
/// # Safety
/// It is the caller's responsibility to ensure the host CPU is capable of PMULL and NEON
/// instructions.
// TODO(tarcieri): investigate ordering optimizations and fusions e.g.`fuse-crypto-eor`
#[inline]
#[target_feature(enable = "aes,neon")]
pub(super) unsafe fn polymul(y: Simd128, h: Simd128) -> Simd128 {
let (h, m, l) = karatsuba1(h, y);
let (h, l) = karatsuba2(h, m, l);
mont_reduce(h, l)
}

impl<const N: usize> ParBlocksSizeUser for Polyval<N>
where
U<N>: ArraySize,
Const<N>: ToUInt,
{
type ParBlocksSize = U<N>;
/// Process an individual block.
///
/// # Safety
/// It is the caller's responsibility to ensure the host CPU is capable of PMULL and NEON
/// instructions.
#[inline]
#[target_feature(enable = "aes,neon")]
pub(super) unsafe fn proc_block(h: FieldElement, y: FieldElement, x: &Block) -> FieldElement {
let y = veorq_u8(y.into(), vld1q_u8(x.as_ptr()));
polymul(y, h.into()).into()
}

impl<const N: usize> UhfBackend for Polyval<N>
where
U<N>: ArraySize,
Const<N>: ToUInt,
{
fn proc_par_blocks(&mut self, blocks: &ParBlocks<Self>) {
unsafe {
let mut h = vdupq_n_u8(0);
let mut m = vdupq_n_u8(0);
let mut l = vdupq_n_u8(0);

// Note: Manually unrolling this loop did not help in benchmarks.
for i in (0..N).rev() {
let mut x = vld1q_u8(blocks[i].as_ptr());
if i == 0 {
x = veorq_u8(x, self.y.into());
}
let y = self.h[i];
let (hh, mm, ll) = karatsuba1(x, y.into());
h = veorq_u8(h, hh);
m = veorq_u8(m, mm);
l = veorq_u8(l, ll);
/// Process multiple blocks in parallel.
///
/// # Safety
/// It is the caller's responsibility to ensure the host CPU is capable of PMULL and NEON
/// instructions.
#[target_feature(enable = "aes,neon")]
pub(super) unsafe fn proc_par_blocks<const N: usize, U: ArraySize>(
powers_of_h: &[FieldElement; N],
y: FieldElement,
blocks: &Array<Block, U>,
) -> FieldElement {
unsafe {
let mut h = vdupq_n_u8(0);
let mut m = vdupq_n_u8(0);
let mut l = vdupq_n_u8(0);

// Note: Manually unrolling this loop did not help in benchmarks.
for i in (0..N).rev() {
let mut x = vld1q_u8(blocks[i].as_ptr());
if i == 0 {
x = veorq_u8(x, y.into());
}

let (h, l) = karatsuba2(h, m, l);
self.y = mont_reduce(h, l).into();
let (hh, mm, ll) = karatsuba1(x, powers_of_h[i].into());
h = veorq_u8(h, hh);
m = veorq_u8(m, mm);
l = veorq_u8(l, ll);
}
}

fn proc_block(&mut self, x: &Block) {
unsafe {
let y = veorq_u8(self.y.into(), vld1q_u8(x.as_ptr()));
self.y = polymul(y, self.h[N - 1].into()).into();
}
}
}

impl<const N: usize> Reset for Polyval<N> {
fn reset(&mut self) {
self.y = FieldElement::default();
let (h, l) = karatsuba2(h, m, l);
mont_reduce(h, l).into()
}
}

impl<const N: usize> Polyval<N> {
/// Get POLYVAL output.
pub(crate) fn finalize(self) -> Tag {
self.y.into()
}
}

impl From<FieldElement> for uint8x16_t {
impl From<FieldElement> for Simd128 {
#[inline]
fn from(fe: FieldElement) -> uint8x16_t {
fn from(fe: FieldElement) -> Simd128 {
unsafe { vld1q_u8(fe.0.as_ptr()) }
}
}

impl From<uint8x16_t> for FieldElement {
impl From<Simd128> for FieldElement {
#[inline]
fn from(fe: uint8x16_t) -> FieldElement {
fn from(fe: Simd128) -> FieldElement {
let mut ret = FieldElement::default();
unsafe { vst1q_u8(ret.0.as_mut_ptr(), fe) }
ret
}
}

/// Multiply "y" by "h" and return the result.
// TODO(tarcieri): investigate ordering optimizations and fusions e.g.`fuse-crypto-eor`
#[inline]
#[target_feature(enable = "neon")]
unsafe fn polymul(y: uint8x16_t, h: uint8x16_t) -> uint8x16_t {
let (h, m, l) = karatsuba1(h, y);
let (h, l) = karatsuba2(h, m, l);
mont_reduce(h, l)
}

/// Karatsuba decomposition for `x*y`.
#[inline]
#[target_feature(enable = "neon")]
unsafe fn karatsuba1(x: uint8x16_t, y: uint8x16_t) -> (uint8x16_t, uint8x16_t, uint8x16_t) {
#[target_feature(enable = "aes,neon")]
unsafe fn karatsuba1(x: Simd128, y: Simd128) -> (Simd128, Simd128, Simd128) {
// First Karatsuba step: decompose x and y.
//
// (x1*y0 + x0*y1) = (x1+x0) * (y1+y0) + (x1*y1) + (x0*y0)
Expand All @@ -179,7 +121,7 @@ unsafe fn karatsuba1(x: uint8x16_t, y: uint8x16_t) -> (uint8x16_t, uint8x16_t, u
/// Karatsuba combine.
#[inline]
#[target_feature(enable = "neon")]
unsafe fn karatsuba2(h: uint8x16_t, m: uint8x16_t, l: uint8x16_t) -> (uint8x16_t, uint8x16_t) {
unsafe fn karatsuba2(h: Simd128, m: Simd128, l: Simd128) -> (Simd128, Simd128) {
// Second Karatsuba step: combine into a 2n-bit product.
//
// m0 ^= l0 ^ h0 // = m0^(l0^h0)
Expand Down Expand Up @@ -218,9 +160,14 @@ unsafe fn karatsuba2(h: uint8x16_t, m: uint8x16_t, l: uint8x16_t) -> (uint8x16_t
(x23, x01)
}

/// POLYVAL reduction polynomial (`x^128 + x^127 + x^126 + x^121 + 1`) encoded in little-endian
/// GF(2)[x] form with reflected reduction terms arising from folding the upper 128-bits of the
/// product into the lower half during modular reduction.
const POLY: u128 = (1 << 127) | (1 << 126) | (1 << 121) | (1 << 63) | (1 << 62) | (1 << 57);

#[inline]
#[target_feature(enable = "neon")]
unsafe fn mont_reduce(x23: uint8x16_t, x01: uint8x16_t) -> uint8x16_t {
#[target_feature(enable = "aes,neon")]
unsafe fn mont_reduce(x23: Simd128, x01: Simd128) -> Simd128 {
// Perform the Montgomery reduction over the 256-bit X.
// [A1:A0] = X0 • poly
// [B1:B0] = [X0 ⊕ A1 : X1 ⊕ A0]
Expand All @@ -236,8 +183,8 @@ unsafe fn mont_reduce(x23: uint8x16_t, x01: uint8x16_t) -> uint8x16_t {

/// Multiplies the low bits in `a` and `b`.
#[inline]
#[target_feature(enable = "neon")]
unsafe fn pmull(a: uint8x16_t, b: uint8x16_t) -> uint8x16_t {
#[target_feature(enable = "aes,neon")]
unsafe fn pmull(a: Simd128, b: Simd128) -> Simd128 {
mem::transmute(vmull_p64(
vgetq_lane_u64(vreinterpretq_u64_u8(a), 0),
vgetq_lane_u64(vreinterpretq_u64_u8(b), 0),
Expand All @@ -246,19 +193,10 @@ unsafe fn pmull(a: uint8x16_t, b: uint8x16_t) -> uint8x16_t {

/// Multiplies the high bits in `a` and `b`.
#[inline]
#[target_feature(enable = "neon")]
unsafe fn pmull2(a: uint8x16_t, b: uint8x16_t) -> uint8x16_t {
#[target_feature(enable = "aes,neon")]
unsafe fn pmull2(a: Simd128, b: Simd128) -> Simd128 {
mem::transmute(vmull_p64(
vgetq_lane_u64(vreinterpretq_u64_u8(a), 1),
vgetq_lane_u64(vreinterpretq_u64_u8(b), 1),
))
}
// TODO(tarcieri): zeroize support
// #[cfg(feature = "zeroize")]
// impl Drop for Polyval<N> {
// fn drop(&mut self) {
// use zeroize::Zeroize;
// self.h.zeroize();
// self.y.zeroize();
// }
// }
Loading