diff --git a/go.mod b/go.mod index 5b7ff11519..3e2037f37f 100644 --- a/go.mod +++ b/go.mod @@ -7,6 +7,7 @@ toolchain go1.24.1 require ( cloud.google.com/go/compute/metadata v0.7.0 cloud.google.com/go/storage v1.54.0 + github.com/cloudflare/circl v1.6.1 github.com/fsnotify/fsnotify v1.9.0 github.com/golangci/golangci-lint v1.64.8 github.com/google/addlicense v1.1.1 @@ -164,7 +165,6 @@ require ( github.com/ckaznocha/intrange v0.3.0 // indirect github.com/clbanning/mxj/v2 v2.7.0 // indirect github.com/cloudevents/sdk-go/v2 v2.15.2 // indirect - github.com/cloudflare/circl v1.3.7 // indirect github.com/cncf/xds/go v0.0.0-20250326154945-ae57f3c0d45f // indirect github.com/common-nighthawk/go-figure v0.0.0-20210622060536-734e95fb86be // indirect github.com/containerd/stargz-snapshotter/estargz v0.16.3 // indirect diff --git a/go.sum b/go.sum index 2cf9aea9a2..ee241181c2 100644 --- a/go.sum +++ b/go.sum @@ -378,8 +378,8 @@ github.com/clbanning/mxj/v2 v2.7.0/go.mod h1:hNiWqW14h+kc+MdF9C6/YoRfjEJoR3ou6tn github.com/client9/misspell v0.3.4/go.mod h1:qj6jICC3Q7zFZvVWo7KLAzC3yx5G7kyvSDkc90ppPyw= github.com/cloudevents/sdk-go/v2 v2.15.2 h1:54+I5xQEnI73RBhWHxbI1XJcqOFOVJN85vb41+8mHUc= github.com/cloudevents/sdk-go/v2 v2.15.2/go.mod h1:lL7kSWAE/V8VI4Wh0jbL2v/jvqsm6tjmaQBSvxcv4uE= -github.com/cloudflare/circl v1.3.7 h1:qlCDlTPz2n9fu58M0Nh1J/JzcFpfgkFHHX3O35r5vcU= -github.com/cloudflare/circl v1.3.7/go.mod h1:sRTcRWXGLrKw6yIGJ+l7amYJFfAXbZG0kBSc8r4zxgA= +github.com/cloudflare/circl v1.6.1 h1:zqIqSPIndyBh1bjLVVDHMPpVKqp8Su/V+6MeDzzQBQ0= +github.com/cloudflare/circl v1.6.1/go.mod h1:uddAzsPgqdMAYatqJ0lsjX1oECcQLIlRpzZh3pJrofs= github.com/cncf/udpa/go v0.0.0-20191209042840-269d4d468f6f/go.mod h1:M8M6+tZqaGXZJjfX53e64911xZQV5JYwmTeXPW+k8Sc= github.com/cncf/udpa/go v0.0.0-20200629203442-efcf912fb354/go.mod h1:WmhPx2Nbnhtbo57+VJT5O0JRkEi1Wbu0z5j0R8u5Hbk= github.com/cncf/udpa/go v0.0.0-20201120205902-5459f2c99403/go.mod h1:WmhPx2Nbnhtbo57+VJT5O0JRkEi1Wbu0z5j0R8u5Hbk= diff --git a/pkg/chains/signing/mldsa/mldsa.go b/pkg/chains/signing/mldsa/mldsa.go new file mode 100644 index 0000000000..88a781f52b --- /dev/null +++ b/pkg/chains/signing/mldsa/mldsa.go @@ -0,0 +1,141 @@ +/* +Copyright 2024 The Tekton Authors +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +package mldsa + +import ( + "crypto" + "errors" + "io" + + "github.com/cloudflare/circl/sign/mldsa/mldsa65" + "github.com/sigstore/sigstore/pkg/signature" +) + +// SignerVerifier implements signature.SignerVerifier and crypto.Signer for MLDSA +type SignerVerifier struct { + priv *mldsa65.PrivateKey + pub *mldsa65.PublicKey +} + +// LoadSignerVerifier creates a new SignerVerifier from a private key +func LoadSignerVerifier(priv *mldsa65.PrivateKey) (*SignerVerifier, error) { + if priv == nil { + return nil, errors.New("private key cannot be nil") + } + + // Get the public key from the private key + pub := priv.Public().(*mldsa65.PublicKey) + + return &SignerVerifier{ + priv: priv, + pub: pub, + }, nil +} + +// LoadVerifier creates a new SignerVerifier from a public key +func LoadVerifier(pub *mldsa65.PublicKey) (*SignerVerifier, error) { + if pub == nil { + return nil, errors.New("public key cannot be nil") + } + + return &SignerVerifier{ + pub: pub, + }, nil +} + +// Public implements crypto.Signer interface +func (s *SignerVerifier) Public() crypto.PublicKey { + return s.pub +} + +// Sign signs the given data +func (s *SignerVerifier) Sign(data []byte) ([]byte, error) { + if s.priv == nil { + return nil, errors.New("private key not available for signing") + } + + sig := make([]byte, mldsa65.SignatureSize) + err := mldsa65.SignTo(s.priv, data, nil, false, sig) + if err != nil { + return nil, err + } + return sig, nil +} + +// SignWithOpts implements crypto.Signer interface +func (s *SignerVerifier) SignWithOpts(rand io.Reader, digest []byte, opts crypto.SignerOpts) ([]byte, error) { + // MLDSA doesn't use pre-hashing, so we use the input directly + return s.Sign(digest) +} + +// SignMessage signs a message from a reader +func (s *SignerVerifier) SignMessage(message io.Reader, opts ...signature.SignOption) ([]byte, error) { + data, err := io.ReadAll(message) + if err != nil { + return nil, err + } + + return s.Sign(data) +} + +// Verify verifies the signature against the data +func (s *SignerVerifier) Verify(data, sig []byte) error { + if s.pub == nil { + return errors.New("public key not available for verification") + } + + if len(sig) != mldsa65.SignatureSize { + return errors.New("invalid signature size") + } + + if !mldsa65.Verify(s.pub, data, nil, sig) { + return errors.New("invalid signature") + } + + return nil +} + +// VerifySignature verifies a signature from readers +func (s *SignerVerifier) VerifySignature(signature, message io.Reader, opts ...signature.VerifyOption) error { + sig, err := io.ReadAll(signature) + if err != nil { + return err + } + + data, err := io.ReadAll(message) + if err != nil { + return err + } + + return s.Verify(data, sig) +} + +// PublicKey returns the public key with optional parameters +func (s *SignerVerifier) PublicKey(opts ...signature.PublicKeyOption) (crypto.PublicKey, error) { + return s.pub, nil +} + +// Type returns the key type for SSH and other uses +func (s *SignerVerifier) Type() string { + return "mldsa65-sha256" +} + +// CreateKey generates a new key pair +func (s *SignerVerifier) CreateKey(rand io.Reader) (crypto.PublicKey, crypto.PrivateKey, error) { + pub, priv, err := mldsa65.GenerateKey(rand) + if err != nil { + return nil, nil, err + } + return pub, priv, nil +} diff --git a/pkg/chains/signing/wrap.go b/pkg/chains/signing/wrap.go index ee032b1321..8ac7ba2e2c 100644 --- a/pkg/chains/signing/wrap.go +++ b/pkg/chains/signing/wrap.go @@ -17,10 +17,13 @@ import ( "bytes" "context" "crypto" + "crypto/sha256" + "encoding/base64" 
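+	// sha256 and base64 are used below to derive an SSH-style
+	// "SHA256:..." fingerprint for ML-DSA public keys, which
+	// ssh.NewPublicKey cannot wrap.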
"encoding/json" "fmt" "io" + "github.com/cloudflare/circl/sign/mldsa/mldsa65" "github.com/in-toto/in-toto-golang/in_toto" "github.com/secure-systems-lab/go-securesystemslib/dsse" "github.com/sigstore/sigstore/pkg/signature" @@ -34,23 +37,40 @@ func Wrap(s Signer) (Signer, error) { return nil, err } - // Generate public key fingerprint - sshpk, err := ssh.NewPublicKey(pub) - if err != nil { - return nil, err + var fingerprint string + var pk crypto.PublicKey + + // Handle MLDSA keys differently + if mldsaPub, ok := pub.(*mldsa65.PublicKey); ok { + // Generate fingerprint from MLDSA public key bytes + pkBytes, err := mldsaPub.MarshalBinary() + if err != nil { + return nil, fmt.Errorf("failed to marshal MLDSA public key: %w", err) + } + hash := sha256.Sum256(pkBytes) + fingerprint = "SHA256:" + base64.StdEncoding.EncodeToString(hash[:]) + pk = pub + } else { + // For other key types, use SSH public key + sshpk, err := ssh.NewPublicKey(pub) + if err != nil { + return nil, err + } + fingerprint = ssh.FingerprintSHA256(sshpk) + pk = sshpk } - fingerprint := ssh.FingerprintSHA256(sshpk) adapter := sslAdapter{ wrapped: s, keyID: fingerprint, - pk: sshpk, + pk: pk, } envelope, err := dsse.NewEnvelopeSigner(&adapter) if err != nil { return nil, err } + return &sslSigner{ wrapper: envelope, typ: s.Type(), diff --git a/pkg/chains/signing/x509/x509.go b/pkg/chains/signing/x509/x509.go index 82e76070ed..6ed0b5ac4d 100644 --- a/pkg/chains/signing/x509/x509.go +++ b/pkg/chains/signing/x509/x509.go @@ -18,6 +18,7 @@ import ( "crypto" "crypto/ecdsa" cx509 "crypto/x509" + "encoding/asn1" "encoding/json" "encoding/pem" "fmt" @@ -34,9 +35,11 @@ import ( "github.com/sigstore/cosign/v2/pkg/providers" "knative.dev/pkg/logging" + "github.com/cloudflare/circl/sign/mldsa/mldsa65" "github.com/sigstore/sigstore/pkg/signature" "github.com/sigstore/sigstore/pkg/tuf" "github.com/tektoncd/chains/pkg/chains/signing" + "github.com/tektoncd/chains/pkg/chains/signing/mldsa" "github.com/tektoncd/chains/pkg/config" ) @@ -44,6 +47,20 @@ const ( defaultOIDCClientID = "sigstore" ) +// MLDSA65 OID: 2.16.840.1.101.3.4.3.18 +var mldsaOID = asn1.ObjectIdentifier{2, 16, 840, 1, 101, 3, 4, 3, 18} + +type pkcs8 struct { + Version int + Algorithm pkcs8Algorithm + PrivateKey []byte +} + +type pkcs8Algorithm struct { + Algorithm asn1.ObjectIdentifier + Parameters asn1.RawValue `asn1:"optional"` +} + // Signer exposes methods to sign payloads. 
type Signer struct { cert string @@ -175,23 +192,82 @@ func loadRootFromURL(root string) ([]byte, error) { return io.ReadAll(resp.Body) } +func extractMLDSAFromPKCS8(der []byte) (*mldsa65.PrivateKey, error) { + // PKCS#8 structure typically has the raw key at the end + // For MLDSA65, we need exactly mldsa65.PrivateKeySize bytes + + if len(der) < mldsa65.PrivateKeySize { + return nil, fmt.Errorf("PKCS#8 data too short: %d bytes, need at least %d", + len(der), mldsa65.PrivateKeySize) + } + + // Strategy 1: Try the last PrivateKeySize bytes (most common case) + if len(der) >= mldsa65.PrivateKeySize { + rawKey := der[len(der)-mldsa65.PrivateKeySize:] + var mldsaKey mldsa65.PrivateKey + if err := mldsaKey.UnmarshalBinary(rawKey); err == nil { + return &mldsaKey, nil + } + } + return nil, fmt.Errorf("no valid MLDSA key found in PKCS#8 data") +} + +func tryLoadMLDSA(data []byte) (*mldsa65.PrivateKey, error) { + // First try direct raw format + var mldsaKey mldsa65.PrivateKey + if err := mldsaKey.UnmarshalBinary(data); err == nil { + return &mldsaKey, nil + } + + // Then try to extract from PKCS#8 + if key, err := extractMLDSAFromPKCS8(data); err == nil { + return key, nil + } + + return nil, fmt.Errorf("data is neither raw MLDSA key nor PKCS#8 wrapped MLDSA key") +} + func x509Signer(ctx context.Context, privateKey []byte) (*Signer, error) { logger := logging.FromContext(ctx) logger.Info("Found x509 key...") p, _ := pem.Decode(privateKey) - if p.Type != "PRIVATE KEY" { - return nil, fmt.Errorf("expected private key, found object of type %s", p.Type) - } - pk, err := cx509.ParsePKCS8PrivateKey(p.Bytes) - if err != nil { - return nil, err + if p == nil { + return nil, fmt.Errorf("failed to decode PEM block") } - signer, err := signature.LoadECDSASignerVerifier(pk.(*ecdsa.PrivateKey), crypto.SHA256) - if err != nil { - return nil, err + + logger.Infof("Attempting to parse private key of type: %s", p.Type) + + switch p.Type { + case "PRIVATE KEY": + // Try PKCS#8 first for ECDSA keys + if pk, err := cx509.ParsePKCS8PrivateKey(p.Bytes); err == nil { + if ecKey, ok := pk.(*ecdsa.PrivateKey); ok { + logger.Info("Using ECDSA private key...") + signer, err := signature.LoadECDSASignerVerifier(ecKey, crypto.SHA256) + if err != nil { + return nil, fmt.Errorf("failed to load ECDSA signer: %w", err) + } + return &Signer{SignerVerifier: signer}, nil + } + } + + // Try MLDSA formats + if mldsaKey, err := tryLoadMLDSA(p.Bytes); err == nil { + logger.Info("Using MLDSA private key...") + signer, err := mldsa.LoadSignerVerifier(mldsaKey) + if err != nil { + return nil, fmt.Errorf("failed to load MLDSA signer: %w", err) + } + return &Signer{SignerVerifier: signer}, nil + } else { + logger.Infof("Failed to load MLDSA key: %v", err) + } + + return nil, fmt.Errorf("unsupported private key format - key could not be parsed as PKCS#8 ECDSA or MLDSA") + default: + return nil, fmt.Errorf("expected private key, found object of type %s", p.Type) } - return &Signer{SignerVerifier: signer}, nil } func cosignSigner(ctx context.Context, secretPath string, privateKey []byte) (*Signer, error) { diff --git a/vendor/github.com/cloudflare/circl/dh/x25519/curve_amd64.s b/vendor/github.com/cloudflare/circl/dh/x25519/curve_amd64.s index b7723185b6..ce9f062894 100644 --- a/vendor/github.com/cloudflare/circl/dh/x25519/curve_amd64.s +++ b/vendor/github.com/cloudflare/circl/dh/x25519/curve_amd64.s @@ -1,4 +1,5 @@ -// +build amd64 +//go:build amd64 && !purego +// +build amd64,!purego #include "textflag.h" diff --git 
a/vendor/github.com/cloudflare/circl/dh/x448/curve_amd64.s b/vendor/github.com/cloudflare/circl/dh/x448/curve_amd64.s index 810aa9e648..ed33ba3d03 100644 --- a/vendor/github.com/cloudflare/circl/dh/x448/curve_amd64.s +++ b/vendor/github.com/cloudflare/circl/dh/x448/curve_amd64.s @@ -1,4 +1,5 @@ -// +build amd64 +//go:build amd64 && !purego +// +build amd64,!purego #include "textflag.h" diff --git a/vendor/github.com/cloudflare/circl/ecc/goldilocks/curve.go b/vendor/github.com/cloudflare/circl/ecc/goldilocks/curve.go index 5a939100d2..1f165141a9 100644 --- a/vendor/github.com/cloudflare/circl/ecc/goldilocks/curve.go +++ b/vendor/github.com/cloudflare/circl/ecc/goldilocks/curve.go @@ -18,6 +18,9 @@ func (Curve) Identity() *Point { func (Curve) IsOnCurve(P *Point) bool { x2, y2, t, t2, z2 := &fp.Elt{}, &fp.Elt{}, &fp.Elt{}, &fp.Elt{}, &fp.Elt{} rhs, lhs := &fp.Elt{}, &fp.Elt{} + // Check z != 0 + eq0 := !fp.IsZero(&P.z) + fp.Mul(t, &P.ta, &P.tb) // t = ta*tb fp.Sqr(x2, &P.x) // x^2 fp.Sqr(y2, &P.y) // y^2 @@ -27,13 +30,14 @@ func (Curve) IsOnCurve(P *Point) bool { fp.Mul(rhs, t2, ¶mD) // dt^2 fp.Add(rhs, rhs, z2) // z^2 + dt^2 fp.Sub(lhs, lhs, rhs) // x^2 + y^2 - (z^2 + dt^2) - eq0 := fp.IsZero(lhs) + eq1 := fp.IsZero(lhs) fp.Mul(lhs, &P.x, &P.y) // xy fp.Mul(rhs, t, &P.z) // tz fp.Sub(lhs, lhs, rhs) // xy - tz - eq1 := fp.IsZero(lhs) - return eq0 && eq1 + eq2 := fp.IsZero(lhs) + + return eq0 && eq1 && eq2 } // Generator returns the generator point. diff --git a/vendor/github.com/cloudflare/circl/internal/conv/conv.go b/vendor/github.com/cloudflare/circl/internal/conv/conv.go index 649a8e931d..3fd0df496f 100644 --- a/vendor/github.com/cloudflare/circl/internal/conv/conv.go +++ b/vendor/github.com/cloudflare/circl/internal/conv/conv.go @@ -5,6 +5,8 @@ import ( "fmt" "math/big" "strings" + + "golang.org/x/crypto/cryptobyte" ) // BytesLe2Hex returns an hexadecimal string of a number stored in a @@ -138,3 +140,34 @@ func BigInt2Uint64Le(z []uint64, x *big.Int) { z[i] = 0 } } + +// MarshalBinary encodes a value into a byte array in a format readable by UnmarshalBinary. +func MarshalBinary(v cryptobyte.MarshalingValue) ([]byte, error) { + const DefaultSize = 32 + b := cryptobyte.NewBuilder(make([]byte, 0, DefaultSize)) + b.AddValue(v) + return b.Bytes() +} + +// MarshalBinaryLen encodes a value into an array of n bytes in a format readable by UnmarshalBinary. +func MarshalBinaryLen(v cryptobyte.MarshalingValue, length uint) ([]byte, error) { + b := cryptobyte.NewFixedBuilder(make([]byte, 0, length)) + b.AddValue(v) + return b.Bytes() +} + +// A UnmarshalingValue decodes itself from a cryptobyte.String and advances the pointer. +// It reports whether the read was successful. +type UnmarshalingValue interface { + Unmarshal(*cryptobyte.String) bool +} + +// UnmarshalBinary recovers a value from a byte array. +// It returns an error if the read was unsuccessful. 
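+//
+// A rough round-trip sketch (the names v and w are assumptions of the
+// example: v implements cryptobyte.MarshalingValue and w implements
+// UnmarshalingValue):
+//
+//	buf, err := MarshalBinary(v)
+//	if err == nil {
+//		err = UnmarshalBinary(w, buf)
+//	}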
+func UnmarshalBinary(v UnmarshalingValue, data []byte) (err error) { + s := cryptobyte.String(data) + if data == nil || !v.Unmarshal(&s) || !s.Empty() { + err = fmt.Errorf("cannot read %T from input string", v) + } + return +} diff --git a/vendor/github.com/cloudflare/circl/math/fp25519/fp_amd64.s b/vendor/github.com/cloudflare/circl/math/fp25519/fp_amd64.s index 5c4aeddecb..1fcc2dee17 100644 --- a/vendor/github.com/cloudflare/circl/math/fp25519/fp_amd64.s +++ b/vendor/github.com/cloudflare/circl/math/fp25519/fp_amd64.s @@ -1,4 +1,5 @@ -// +build amd64 +//go:build amd64 && !purego +// +build amd64,!purego #include "textflag.h" #include "fp_amd64.h" diff --git a/vendor/github.com/cloudflare/circl/math/fp448/fp_amd64.s b/vendor/github.com/cloudflare/circl/math/fp448/fp_amd64.s index 435addf5e6..3f1f07c986 100644 --- a/vendor/github.com/cloudflare/circl/math/fp448/fp_amd64.s +++ b/vendor/github.com/cloudflare/circl/math/fp448/fp_amd64.s @@ -1,4 +1,5 @@ -// +build amd64 +//go:build amd64 && !purego +// +build amd64,!purego #include "textflag.h" #include "fp_amd64.h" diff --git a/vendor/github.com/cloudflare/circl/math/integer.go b/vendor/github.com/cloudflare/circl/math/integer.go new file mode 100644 index 0000000000..9c80c23b59 --- /dev/null +++ b/vendor/github.com/cloudflare/circl/math/integer.go @@ -0,0 +1,16 @@ +package math + +import "math/bits" + +// NextPow2 finds the next power of two (N=2^k, k>=0) greater than n. +// If n is already a power of two, then this function returns n, and log2(n). +func NextPow2(n uint) (N uint, k uint) { + if bits.OnesCount(n) == 1 { + k = uint(bits.TrailingZeros(n)) + N = n + } else { + k = uint(bits.Len(n)) + N = uint(1) << k + } + return +} diff --git a/vendor/github.com/cloudflare/circl/sign/ed25519/point.go b/vendor/github.com/cloudflare/circl/sign/ed25519/point.go index 374a69503c..d1c3b146b7 100644 --- a/vendor/github.com/cloudflare/circl/sign/ed25519/point.go +++ b/vendor/github.com/cloudflare/circl/sign/ed25519/point.go @@ -164,7 +164,7 @@ func (P *pointR1) isEqual(Q *pointR1) bool { fp.Mul(r, r, &P.z) fp.Sub(l, l, r) b = b && fp.IsZero(l) - return b + return b && !fp.IsZero(&P.z) && !fp.IsZero(&Q.z) } func (P *pointR3) neg() { diff --git a/vendor/github.com/cloudflare/circl/sign/ed448/ed448.go b/vendor/github.com/cloudflare/circl/sign/ed448/ed448.go index 324bd8f334..c368b181b4 100644 --- a/vendor/github.com/cloudflare/circl/sign/ed448/ed448.go +++ b/vendor/github.com/cloudflare/circl/sign/ed448/ed448.go @@ -206,7 +206,7 @@ func newKeyFromSeed(privateKey, seed []byte) { func signAll(signature []byte, privateKey PrivateKey, message, ctx []byte, preHash bool) { if len(ctx) > ContextMaxSize { - panic(fmt.Errorf("ed448: bad context length: " + strconv.Itoa(len(ctx)))) + panic(fmt.Errorf("ed448: bad context length: %v", len(ctx))) } H := sha3.NewShake256() diff --git a/vendor/github.com/cloudflare/circl/sign/internal/dilithium/amd64.go b/vendor/github.com/cloudflare/circl/sign/internal/dilithium/amd64.go new file mode 100644 index 0000000000..d5d224ee84 --- /dev/null +++ b/vendor/github.com/cloudflare/circl/sign/internal/dilithium/amd64.go @@ -0,0 +1,154 @@ +//go:build amd64 && !purego +// +build amd64,!purego + +package dilithium + +import ( + "golang.org/x/sys/cpu" +) + +// Execute an in-place forward NTT on as. +// +// Assumes the coefficients are in Montgomery representation and bounded +// by 2*Q. The resulting coefficients are again in Montgomery representation, +// but are only bounded bt 18*Q. 
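+//
+// A rough sketch of the usual pattern, multiplication via the NTT
+// domain (cf. PolyDotHat in the mode packages; a and b are assumed to
+// be Polys in Montgomery form bounded by 2*Q):
+//
+//	a.NTT()
+//	b.NTT()
+//	var p Poly
+//	p.MulHat(&a, &b) // coefficients bounded by 2q
+//	p.InvNTT()       // negacyclic product, Montgomery conventions apply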
+func (p *Poly) NTT() { + if cpu.X86.HasAVX2 { + nttAVX2( + (*[N]uint32)(p), + ) + } else { + p.nttGeneric() + } +} + +// Execute an in-place inverse NTT and multiply by Montgomery factor R +// +// Assumes the coefficients are in Montgomery representation and bounded +// by 2*Q. The resulting coefficients are again in Montgomery representation +// and bounded by 2*Q. +func (p *Poly) InvNTT() { + if cpu.X86.HasAVX2 { + invNttAVX2( + (*[N]uint32)(p), + ) + } else { + p.invNttGeneric() + } +} + +// Sets p to the polynomial whose coefficients are the pointwise multiplication +// of those of a and b. The coefficients of p are bounded by 2q. +// +// Assumes a and b are in Montgomery form and that the pointwise product +// of each coefficient is below 2³² q. +func (p *Poly) MulHat(a, b *Poly) { + if cpu.X86.HasAVX2 { + mulHatAVX2( + (*[N]uint32)(p), + (*[N]uint32)(a), + (*[N]uint32)(b), + ) + } else { + p.mulHatGeneric(a, b) + } +} + +// Sets p to a + b. Does not normalize polynomials. +func (p *Poly) Add(a, b *Poly) { + if cpu.X86.HasAVX2 { + addAVX2( + (*[N]uint32)(p), + (*[N]uint32)(a), + (*[N]uint32)(b), + ) + } else { + p.addGeneric(a, b) + } +} + +// Sets p to a - b. +// +// Warning: assumes coefficients of b are less than 2q. +// Sets p to a + b. Does not normalize polynomials. +func (p *Poly) Sub(a, b *Poly) { + if cpu.X86.HasAVX2 { + subAVX2( + (*[N]uint32)(p), + (*[N]uint32)(a), + (*[N]uint32)(b), + ) + } else { + p.subGeneric(a, b) + } +} + +// Writes p whose coefficients are in [0, 16) to buf, which must be of +// length N/2. +func (p *Poly) PackLe16(buf []byte) { + if cpu.X86.HasAVX2 { + if len(buf) < PolyLe16Size { + panic("buf too small") + } + packLe16AVX2( + (*[N]uint32)(p), + &buf[0], + ) + } else { + p.packLe16Generic(buf) + } +} + +// Reduces each of the coefficients to <2q. +func (p *Poly) ReduceLe2Q() { + if cpu.X86.HasAVX2 { + reduceLe2QAVX2((*[N]uint32)(p)) + } else { + p.reduceLe2QGeneric() + } +} + +// Reduce each of the coefficients to > 23 + x2 := x & 0x7FFFFF // 2²³-1 + return x2 + (x1 << 13) - x1 +} + +// Returns x mod q. +func modQ(x uint32) uint32 { + return le2qModQ(ReduceLe2Q(x)) +} + +// For x R ≤ q 2³², find y ≤ 2q with y = x mod q. +func montReduceLe2Q(x uint64) uint32 { + // Qinv = 4236238847 = -(q⁻¹) mod 2³² + m := (x * Qinv) & 0xffffffff + return uint32((x + m*uint64(Q)) >> 32) +} + +// Returns x mod q for 0 ≤ x < 2q. +func le2qModQ(x uint32) uint32 { + x -= Q + mask := uint32(int32(x) >> 31) // mask is 2³²-1 if x was neg.; 0 otherwise + return x + (mask & Q) +} + +// Splits 0 ≤ a < Q into a0 and a1 with a = a1*2ᴰ + a0 +// and -2ᴰ⁻¹ < a0 < 2ᴰ⁻¹. Returns a0 + Q and a1. +func power2round(a uint32) (a0plusQ, a1 uint32) { + // We effectively compute a0 = a mod± 2ᵈ + // and a1 = (a - a0) / 2ᵈ. + a0 := a & ((1 << D) - 1) // a mod 2ᵈ + + // a0 is one of 0, 1, ..., 2ᵈ⁻¹-1, 2ᵈ⁻¹, 2ᵈ⁻¹+1, ..., 2ᵈ-1 + a0 -= (1 << (D - 1)) + 1 + // now a0 is -2ᵈ⁻¹-1, -2ᵈ⁻¹, ..., -2, -1, 0, ..., 2ᵈ⁻¹-2 + // Next, we add 2ᴰ to those a0 that are negative (seen as int32). + a0 += uint32(int32(a0)>>31) & (1 << D) + // now a0 is 2ᵈ⁻¹-1, 2ᵈ⁻¹, ..., 2ᵈ-2, 2ᵈ-1, 0, ..., 2ᵈ⁻¹-2 + a0 -= (1 << (D - 1)) - 1 + // now a0 id 0, 1, 2, ..., 2ᵈ⁻¹-1, 2ᵈ⁻¹-1, -2ᵈ⁻¹-1, ... + // which is what we want. 
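+	// Worked example (D = 13): a = 5000 = 1·2¹³ − 3192, so a1 = 1 and
+	// a0 = −3192, which is returned as a0plusQ = Q − 3192.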
+ a0plusQ = Q + a0 + a1 = (a - a0) >> D + return +} diff --git a/vendor/github.com/cloudflare/circl/sign/internal/dilithium/generic.go b/vendor/github.com/cloudflare/circl/sign/internal/dilithium/generic.go new file mode 100644 index 0000000000..25321f5d55 --- /dev/null +++ b/vendor/github.com/cloudflare/circl/sign/internal/dilithium/generic.go @@ -0,0 +1,81 @@ +//go:build !amd64 || purego +// +build !amd64 purego + +package dilithium + +// Execute an in-place forward NTT on as. +// +// Assumes the coefficients are in Montgomery representation and bounded +// by 2*Q. The resulting coefficients are again in Montgomery representation, +// but are only bounded bt 18*Q. +func (p *Poly) NTT() { + p.nttGeneric() +} + +// Execute an in-place inverse NTT and multiply by Montgomery factor R +// +// Assumes the coefficients are in Montgomery representation and bounded +// by 2*Q. The resulting coefficients are again in Montgomery representation +// and bounded by 2*Q. +func (p *Poly) InvNTT() { + p.invNttGeneric() +} + +// Sets p to the polynomial whose coefficients are the pointwise multiplication +// of those of a and b. The coefficients of p are bounded by 2q. +// +// Assumes a and b are in Montgomery form and that the pointwise product +// of each coefficient is below 2³² q. +func (p *Poly) MulHat(a, b *Poly) { + p.mulHatGeneric(a, b) +} + +// Sets p to a + b. Does not normalize polynomials. +func (p *Poly) Add(a, b *Poly) { + p.addGeneric(a, b) +} + +// Sets p to a - b. +// +// Warning: assumes coefficients of b are less than 2q. +// Sets p to a + b. Does not normalize polynomials. +func (p *Poly) Sub(a, b *Poly) { + p.subGeneric(a, b) +} + +// Writes p whose coefficients are in [0, 16) to buf, which must be of +// length N/2. +func (p *Poly) PackLe16(buf []byte) { + p.packLe16Generic(buf) +} + +// Reduces each of the coefficients to <2q. +func (p *Poly) ReduceLe2Q() { + p.reduceLe2QGeneric() +} + +// Reduce each of the coefficients to 0; l >>= 1 { + // On the n-th iteration of the l-loop, the coefficients start off + // bounded by n*2*Q. + // + // offset effectively loops over the row groups in this column; it + // is the first row in the row group. + for offset := uint(0); offset < N-l; offset += 2 * l { + k++ + zeta := uint64(Zetas[k]) + + // j loops over each butterfly in the row group. + for j := offset; j < offset+l; j++ { + t := montReduceLe2Q(zeta * uint64(p[j+l])) + p[j+l] = p[j] + (2*Q - t) // Cooley--Tukey butterfly + p[j] += t + } + } + } +} + +// Execute an in-place inverse NTT and multiply by Montgomery factor R +// +// Assumes the coefficients are in Montgomery representation and bounded +// by 2*Q. The resulting coefficients are again in Montgomery representation +// and bounded by 2*Q. +func (p *Poly) invNttGeneric() { + k := 0 // Index into InvZetas + + // We basically do the opposite of NTT, but postpone dividing by 2 in the + // inverse of the Cooley--Tukey butterfly and accumulate that to a big + // division by 2⁸ at the end. See comments in the NTT() function. + + for l := uint(1); l < N; l <<= 1 { + // On the n-th iteration of the l-loop, the coefficients start off + // bounded by 2ⁿ⁻¹*2*Q, so by 256*Q on the last. 
+ for offset := uint(0); offset < N-l; offset += 2 * l { + zeta := uint64(InvZetas[k]) + k++ + for j := offset; j < offset+l; j++ { + t := p[j] // Gentleman--Sande butterfly + p[j] = t + p[j+l] + t += 256*Q - p[j+l] + p[j+l] = montReduceLe2Q(zeta * uint64(t)) + } + } + } + + for j := uint(0); j < N; j++ { + // ROver256 = 41978 = (256)⁻¹ R² + p[j] = montReduceLe2Q(ROver256 * uint64(p[j])) + } +} diff --git a/vendor/github.com/cloudflare/circl/sign/internal/dilithium/pack.go b/vendor/github.com/cloudflare/circl/sign/internal/dilithium/pack.go new file mode 100644 index 0000000000..4b952a004f --- /dev/null +++ b/vendor/github.com/cloudflare/circl/sign/internal/dilithium/pack.go @@ -0,0 +1,160 @@ +package dilithium + +// Sets p to the polynomial whose coefficients are less than 1024 encoded +// into buf (which must be of size PolyT1Size). +// +// p will be normalized. +func (p *Poly) UnpackT1(buf []byte) { + j := 0 + for i := 0; i < PolyT1Size; i += 5 { + p[j] = (uint32(buf[i]) | (uint32(buf[i+1]) << 8)) & 0x3ff + p[j+1] = (uint32(buf[i+1]>>2) | (uint32(buf[i+2]) << 6)) & 0x3ff + p[j+2] = (uint32(buf[i+2]>>4) | (uint32(buf[i+3]) << 4)) & 0x3ff + p[j+3] = (uint32(buf[i+3]>>6) | (uint32(buf[i+4]) << 2)) & 0x3ff + j += 4 + } +} + +// Writes p whose coefficients are in (-2ᵈ⁻¹, 2ᵈ⁻¹] into buf which +// has to be of length at least PolyT0Size. +// +// Assumes that the coefficients are not normalized, but lie in the +// range (q-2ᵈ⁻¹, q+2ᵈ⁻¹]. +func (p *Poly) PackT0(buf []byte) { + j := 0 + for i := 0; i < PolyT0Size; i += 13 { + p0 := Q + (1 << (D - 1)) - p[j] + p1 := Q + (1 << (D - 1)) - p[j+1] + p2 := Q + (1 << (D - 1)) - p[j+2] + p3 := Q + (1 << (D - 1)) - p[j+3] + p4 := Q + (1 << (D - 1)) - p[j+4] + p5 := Q + (1 << (D - 1)) - p[j+5] + p6 := Q + (1 << (D - 1)) - p[j+6] + p7 := Q + (1 << (D - 1)) - p[j+7] + + buf[i] = byte(p0 >> 0) + buf[i+1] = byte(p0>>8) | byte(p1<<5) + buf[i+2] = byte(p1 >> 3) + buf[i+3] = byte(p1>>11) | byte(p2<<2) + buf[i+4] = byte(p2>>6) | byte(p3<<7) + buf[i+5] = byte(p3 >> 1) + buf[i+6] = byte(p3>>9) | byte(p4<<4) + buf[i+7] = byte(p4 >> 4) + buf[i+8] = byte(p4>>12) | byte(p5<<1) + buf[i+9] = byte(p5>>7) | byte(p6<<6) + buf[i+10] = byte(p6 >> 2) + buf[i+11] = byte(p6>>10) | byte(p7<<3) + buf[i+12] = byte(p7 >> 5) + j += 8 + } +} + +// Sets p to the polynomial packed into buf by PackT0. +// +// The coefficients of p will not be normalized, but will lie +// in (-2ᵈ⁻¹, 2ᵈ⁻¹]. +func (p *Poly) UnpackT0(buf []byte) { + j := 0 + for i := 0; i < PolyT0Size; i += 13 { + p[j] = Q + (1 << (D - 1)) - ((uint32(buf[i]) | + (uint32(buf[i+1]) << 8)) & 0x1fff) + p[j+1] = Q + (1 << (D - 1)) - (((uint32(buf[i+1]) >> 5) | + (uint32(buf[i+2]) << 3) | + (uint32(buf[i+3]) << 11)) & 0x1fff) + p[j+2] = Q + (1 << (D - 1)) - (((uint32(buf[i+3]) >> 2) | + (uint32(buf[i+4]) << 6)) & 0x1fff) + p[j+3] = Q + (1 << (D - 1)) - (((uint32(buf[i+4]) >> 7) | + (uint32(buf[i+5]) << 1) | + (uint32(buf[i+6]) << 9)) & 0x1fff) + p[j+4] = Q + (1 << (D - 1)) - (((uint32(buf[i+6]) >> 4) | + (uint32(buf[i+7]) << 4) | + (uint32(buf[i+8]) << 12)) & 0x1fff) + p[j+5] = Q + (1 << (D - 1)) - (((uint32(buf[i+8]) >> 1) | + (uint32(buf[i+9]) << 7)) & 0x1fff) + p[j+6] = Q + (1 << (D - 1)) - (((uint32(buf[i+9]) >> 6) | + (uint32(buf[i+10]) << 2) | + (uint32(buf[i+11]) << 10)) & 0x1fff) + p[j+7] = Q + (1 << (D - 1)) - ((uint32(buf[i+11]) >> 3) | + (uint32(buf[i+12]) << 5)) + + j += 8 + } +} + +// Writes p whose coefficients are less than 1024 into buf, which must be +// of size at least PolyT1Size . 
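+// (Four 10-bit coefficients pack into five bytes, so PolyT1Size = 320.)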
+// +// Assumes coefficients of p are normalized. +func (p *Poly) PackT1(buf []byte) { + j := 0 + for i := 0; i < PolyT1Size; i += 5 { + buf[i] = byte(p[j]) + buf[i+1] = byte(p[j]>>8) | byte(p[j+1]<<2) + buf[i+2] = byte(p[j+1]>>6) | byte(p[j+2]<<4) + buf[i+3] = byte(p[j+2]>>4) | byte(p[j+3]<<6) + buf[i+4] = byte(p[j+3] >> 2) + j += 4 + } +} + +// Writes p whose coefficients are in [0, 16) to buf, which must be of +// length N/2. +func (p *Poly) packLe16Generic(buf []byte) { + j := 0 + for i := 0; i < PolyLe16Size; i++ { + buf[i] = byte(p[j]) | byte(p[j+1]<<4) + j += 2 + } +} + +// Writes p with 60 non-zero coefficients {-1,1} to buf, which must have +// length 40. +func (p *Poly) PackB60(buf []byte) { + // We start with a mask of the non-zero positions of p (which is 32 bytes) + // and then append 60 packed bits, where a one indicates a negative + // coefficients. + var signs uint64 + mask := uint64(1) + for i := 0; i < 32; i++ { + buf[i] = 0 + for j := 0; j < 8; j++ { + if p[8*i+j] != 0 { + buf[i] |= 1 << uint(j) + if p[8*i+j] == Q-1 { + signs |= mask + } + mask <<= 1 + } + } + } + for i := uint64(0); i < 8; i++ { + buf[i+32] = uint8(signs >> (8 * i)) + } +} + +// UnpackB60 sets p to the polynomial packed into buf with Poly.PackB60(). +// +// Returns whether unpacking was successful. +func (p *Poly) UnpackB60(buf []byte) bool { + *p = Poly{} // zero p + signs := (uint64(buf[32]) | (uint64(buf[33]) << 8) | + (uint64(buf[34]) << 16) | (uint64(buf[35]) << 24) | + (uint64(buf[36]) << 32) | (uint64(buf[37]) << 40) | + (uint64(buf[38]) << 48) | (uint64(buf[39]) << 56)) + if signs>>60 != 0 { + return false // ensure unused bits are zero for strong unforgeability + } + + for i := 0; i < 32; i++ { + for j := 0; j < 8; j++ { + if (buf[i]>>uint(j))&1 == 1 { + p[8*i+j] = 1 + // Note 1 ^ (1 | (Q-1)) = Q-1 and (-1)&x = x + p[8*i+j] ^= uint32(-(signs & 1)) & (1 | (Q - 1)) + signs >>= 1 + } + } + } + + return true +} diff --git a/vendor/github.com/cloudflare/circl/sign/internal/dilithium/params.go b/vendor/github.com/cloudflare/circl/sign/internal/dilithium/params.go new file mode 100644 index 0000000000..f423217f02 --- /dev/null +++ b/vendor/github.com/cloudflare/circl/sign/internal/dilithium/params.go @@ -0,0 +1,18 @@ +package dilithium + +import ( + "github.com/cloudflare/circl/sign/internal/dilithium/params" +) + +const ( + SeedSize = params.SeedSize + N = params.N + Q = params.Q + QBits = params.QBits + Qinv = params.Qinv + ROver256 = params.ROver256 + D = params.D + PolyT1Size = params.PolyT1Size + PolyT0Size = params.PolyT0Size + PolyLe16Size = params.PolyLe16Size +) diff --git a/vendor/github.com/cloudflare/circl/sign/internal/dilithium/params/params.go b/vendor/github.com/cloudflare/circl/sign/internal/dilithium/params/params.go new file mode 100644 index 0000000000..2df20e3a40 --- /dev/null +++ b/vendor/github.com/cloudflare/circl/sign/internal/dilithium/params/params.go @@ -0,0 +1,25 @@ +package params + +// We put these parameters in a separate package so that the Go code, +// such as ntt_amd64_src.go, that generates assembler can import it. + +const ( + SeedSize = 32 + N = 256 + Q = 8380417 // 2²³ - 2¹³ + 1 + QBits = 23 + Qinv = 4236238847 // = -(q^-1) mod 2³² + ROver256 = 41978 // = (256)⁻¹ R² mod q, where R=2³² + D = 13 + + // Size of T1 packed. (Note that the formula is not valid in general, + // but it is for the parameters used in the modes of Dilithium.) + PolyT1Size = (N * (QBits - D)) / 8 + + // Size of T0 packed. 
(Note that the formula is not valid in general, + // but it is for the parameters used in the modes of Dilithium.) + PolyT0Size = (N * D) / 8 + + // Size of a packed polynomial whose coefficients are in [0,16). + PolyLe16Size = N / 2 +) diff --git a/vendor/github.com/cloudflare/circl/sign/internal/dilithium/poly.go b/vendor/github.com/cloudflare/circl/sign/internal/dilithium/poly.go new file mode 100644 index 0000000000..96c0551b38 --- /dev/null +++ b/vendor/github.com/cloudflare/circl/sign/internal/dilithium/poly.go @@ -0,0 +1,101 @@ +package dilithium + +// An element of our base ring R which are polynomials over Z_q modulo +// the equation Xᴺ = -1, where q=2²³ - 2¹³ + 1 and N=256. +// +// Coefficients aren't always reduced. See Normalize(). +type Poly [N]uint32 + +// Reduces each of the coefficients to <2q. +func (p *Poly) reduceLe2QGeneric() { + for i := uint(0); i < N; i++ { + p[i] = ReduceLe2Q(p[i]) + } +} + +// Reduce each of the coefficients to > 31) + // Sets x to {0, 1, ..., (Q-1)/2, (Q-1)/2, ..., 1} + x = int32((Q-1)/2) - x + if uint32(x) >= bound { + return true + } + } + return false +} + +// Splits p into p1 and p0 such that [i]p1 * 2ᴰ + [i]p0 = [i]p +// with -2ᴰ⁻¹ < [i]p0 ≤ 2ᴰ⁻¹. Returns p0 + Q and p1. +// +// Requires the coefficients of p to be normalized. +func (p *Poly) Power2Round(p0PlusQ, p1 *Poly) { + for i := 0; i < N; i++ { + p0PlusQ[i], p1[i] = power2round(p[i]) + } +} + +// Sets p to the polynomial whose coefficients are the pointwise multiplication +// of those of a and b. The coefficients of p are bounded by 2q. +// +// Assumes a and b are in Montgomery form and that the pointwise product +// of each coefficient is below 2³² q. +func (p *Poly) mulHatGeneric(a, b *Poly) { + for i := 0; i < N; i++ { + p[i] = montReduceLe2Q(uint64(a[i]) * uint64(b[i])) + } +} + +// Sets p to 2ᵈ q without reducing. +// +// So it requires the coefficients of p to be less than 2³²⁻ᴰ. +func (p *Poly) mulBy2toDGeneric(q *Poly) { + for i := 0; i < N; i++ { + p[i] = q[i] << D + } +} diff --git a/vendor/github.com/cloudflare/circl/sign/internal/dilithium/stubs_amd64.go b/vendor/github.com/cloudflare/circl/sign/internal/dilithium/stubs_amd64.go new file mode 100644 index 0000000000..ca92f15ef1 --- /dev/null +++ b/vendor/github.com/cloudflare/circl/sign/internal/dilithium/stubs_amd64.go @@ -0,0 +1,35 @@ +// Code generated by command: go run src.go -out ../amd64.s -stubs ../stubs_amd64.go -pkg dilithium. DO NOT EDIT. + +//go:build amd64 && !purego + +package dilithium + +//go:noescape +func nttAVX2(p *[256]uint32) + +//go:noescape +func invNttAVX2(p *[256]uint32) + +//go:noescape +func mulHatAVX2(p *[256]uint32, a *[256]uint32, b *[256]uint32) + +//go:noescape +func addAVX2(p *[256]uint32, a *[256]uint32, b *[256]uint32) + +//go:noescape +func subAVX2(p *[256]uint32, a *[256]uint32, b *[256]uint32) + +//go:noescape +func packLe16AVX2(p *[256]uint32, buf *byte) + +//go:noescape +func reduceLe2QAVX2(p *[256]uint32) + +//go:noescape +func le2qModQAVX2(p *[256]uint32) + +//go:noescape +func exceedsAVX2(p *[256]uint32, bound uint32) uint8 + +//go:noescape +func mulBy2toDAVX2(p *[256]uint32, q *[256]uint32) diff --git a/vendor/github.com/cloudflare/circl/sign/mldsa/mldsa65/dilithium.go b/vendor/github.com/cloudflare/circl/sign/mldsa/mldsa65/dilithium.go new file mode 100644 index 0000000000..23a7b9a1f8 --- /dev/null +++ b/vendor/github.com/cloudflare/circl/sign/mldsa/mldsa65/dilithium.go @@ -0,0 +1,361 @@ +// Code generated from pkg.templ.go. DO NOT EDIT. 
+ +// mldsa65 implements NIST signature scheme ML-DSA-65 as defined in FIPS204. +package mldsa65 + +import ( + "crypto" + cryptoRand "crypto/rand" + "encoding/asn1" + "errors" + "io" + + "github.com/cloudflare/circl/sign" + common "github.com/cloudflare/circl/sign/internal/dilithium" + "github.com/cloudflare/circl/sign/mldsa/mldsa65/internal" +) + +const ( + // Size of seed for NewKeyFromSeed + SeedSize = common.SeedSize + + // Size of a packed PublicKey + PublicKeySize = internal.PublicKeySize + + // Size of a packed PrivateKey + PrivateKeySize = internal.PrivateKeySize + + // Size of a signature + SignatureSize = internal.SignatureSize +) + +// PublicKey is the type of ML-DSA-65 public key +type PublicKey internal.PublicKey + +// PrivateKey is the type of ML-DSA-65 private key +type PrivateKey internal.PrivateKey + +// GenerateKey generates a public/private key pair using entropy from rand. +// If rand is nil, crypto/rand.Reader will be used. +func GenerateKey(rand io.Reader) (*PublicKey, *PrivateKey, error) { + pk, sk, err := internal.GenerateKey(rand) + return (*PublicKey)(pk), (*PrivateKey)(sk), err +} + +// NewKeyFromSeed derives a public/private key pair using the given seed. +func NewKeyFromSeed(seed *[SeedSize]byte) (*PublicKey, *PrivateKey) { + pk, sk := internal.NewKeyFromSeed(seed) + return (*PublicKey)(pk), (*PrivateKey)(sk) +} + +// SignTo signs the given message and writes the signature into signature. +// It will panic if signature is not of length at least SignatureSize. +// +// ctx is the optional context string. Errors if ctx is larger than 255 bytes. +// A nil context string is equivalent to an empty context string. +func SignTo(sk *PrivateKey, msg, ctx []byte, randomized bool, sig []byte) error { + var rnd [32]byte + if randomized { + _, err := cryptoRand.Read(rnd[:]) + if err != nil { + return err + } + } + + if len(ctx) > 255 { + return sign.ErrContextTooLong + } + + internal.SignTo( + (*internal.PrivateKey)(sk), + func(w io.Writer) { + _, _ = w.Write([]byte{0}) + _, _ = w.Write([]byte{byte(len(ctx))}) + + if ctx != nil { + _, _ = w.Write(ctx) + } + w.Write(msg) + }, + rnd, + sig, + ) + return nil +} + +// Do not use. Implements ML-DSA.Sign_internal used for compatibility tests. +func (sk *PrivateKey) unsafeSignInternal(msg []byte, rnd [32]byte) []byte { + var ret [SignatureSize]byte + internal.SignTo( + (*internal.PrivateKey)(sk), + func(w io.Writer) { + _, _ = w.Write(msg) + }, + rnd, + ret[:], + ) + return ret[:] +} + +// Do not use. Implements ML-DSA.Verify_internal used for compatibility tests. +func unsafeVerifyInternal(pk *PublicKey, msg, sig []byte) bool { + return internal.Verify( + (*internal.PublicKey)(pk), + func(w io.Writer) { + _, _ = w.Write(msg) + }, + sig, + ) +} + +// Verify checks whether the given signature by pk on msg is valid. +// +// ctx is the optional context string. Fails if ctx is larger than 255 bytes. +// A nil context string is equivalent to an empty context string. +func Verify(pk *PublicKey, msg, ctx, sig []byte) bool { + if len(ctx) > 255 { + return false + } + return internal.Verify( + (*internal.PublicKey)(pk), + func(w io.Writer) { + _, _ = w.Write([]byte{0}) + _, _ = w.Write([]byte{byte(len(ctx))}) + + if ctx != nil { + _, _ = w.Write(ctx) + } + _, _ = w.Write(msg) + }, + sig, + ) +} + +// Sets pk to the public key encoded in buf. +func (pk *PublicKey) Unpack(buf *[PublicKeySize]byte) { + (*internal.PublicKey)(pk).Unpack(buf) +} + +// Sets sk to the private key encoded in buf. 
+func (sk *PrivateKey) Unpack(buf *[PrivateKeySize]byte) { + (*internal.PrivateKey)(sk).Unpack(buf) +} + +// Packs the public key into buf. +func (pk *PublicKey) Pack(buf *[PublicKeySize]byte) { + (*internal.PublicKey)(pk).Pack(buf) +} + +// Packs the private key into buf. +func (sk *PrivateKey) Pack(buf *[PrivateKeySize]byte) { + (*internal.PrivateKey)(sk).Pack(buf) +} + +// Packs the public key. +func (pk *PublicKey) Bytes() []byte { + var buf [PublicKeySize]byte + pk.Pack(&buf) + return buf[:] +} + +// Packs the private key. +func (sk *PrivateKey) Bytes() []byte { + var buf [PrivateKeySize]byte + sk.Pack(&buf) + return buf[:] +} + +// Packs the public key. +func (pk *PublicKey) MarshalBinary() ([]byte, error) { + return pk.Bytes(), nil +} + +// Packs the private key. +func (sk *PrivateKey) MarshalBinary() ([]byte, error) { + return sk.Bytes(), nil +} + +// Unpacks the public key from data. +func (pk *PublicKey) UnmarshalBinary(data []byte) error { + if len(data) != PublicKeySize { + return errors.New("packed public key must be of mldsa65.PublicKeySize bytes") + } + var buf [PublicKeySize]byte + copy(buf[:], data) + pk.Unpack(&buf) + return nil +} + +// Unpacks the private key from data. +func (sk *PrivateKey) UnmarshalBinary(data []byte) error { + if len(data) != PrivateKeySize { + return errors.New("packed private key must be of mldsa65.PrivateKeySize bytes") + } + var buf [PrivateKeySize]byte + copy(buf[:], data) + sk.Unpack(&buf) + return nil +} + +// Sign signs the given message. +// +// opts.HashFunc() must return zero, which can be achieved by passing +// crypto.Hash(0) for opts. rand is ignored. Will only return an error +// if opts.HashFunc() is non-zero. +// +// This function is used to make PrivateKey implement the crypto.Signer +// interface. The package-level SignTo function might be more convenient +// to use. +func (sk *PrivateKey) Sign(rand io.Reader, msg []byte, opts crypto.SignerOpts) ( + sig []byte, err error) { + var ret [SignatureSize]byte + + if opts.HashFunc() != crypto.Hash(0) { + return nil, errors.New("dilithium: cannot sign hashed message") + } + if err = SignTo(sk, msg, nil, false, ret[:]); err != nil { + return nil, err + } + + return ret[:], nil +} + +// Computes the public key corresponding to this private key. +// +// Returns a *PublicKey. The type crypto.PublicKey is used to make +// PrivateKey implement the crypto.Signer interface. +func (sk *PrivateKey) Public() crypto.PublicKey { + return (*PublicKey)((*internal.PrivateKey)(sk).Public()) +} + +// Equal returns whether the two private keys equal. +func (sk *PrivateKey) Equal(other crypto.PrivateKey) bool { + castOther, ok := other.(*PrivateKey) + if !ok { + return false + } + return (*internal.PrivateKey)(sk).Equal((*internal.PrivateKey)(castOther)) +} + +// Equal returns whether the two public keys equal. +func (pk *PublicKey) Equal(other crypto.PublicKey) bool { + castOther, ok := other.(*PublicKey) + if !ok { + return false + } + return (*internal.PublicKey)(pk).Equal((*internal.PublicKey)(castOther)) +} + +// Boilerplate for generic signatures API + +type scheme struct{} + +var sch sign.Scheme = &scheme{} + +// Scheme returns a generic signature interface for ML-DSA-65. 
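+//
+// A rough usage sketch via the generic API (msg is a placeholder for
+// the caller's message bytes):
+//
+//	s := mldsa65.Scheme()
+//	pk, sk, err := s.GenerateKey()
+//	sig := s.Sign(sk, msg, nil)       // panics on key-type mismatch
+//	ok := s.Verify(pk, msg, sig, nil)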
+func Scheme() sign.Scheme { return sch } + +func (*scheme) Name() string { return "ML-DSA-65" } +func (*scheme) PublicKeySize() int { return PublicKeySize } +func (*scheme) PrivateKeySize() int { return PrivateKeySize } +func (*scheme) SignatureSize() int { return SignatureSize } +func (*scheme) SeedSize() int { return SeedSize } + +// TODO TLSIdentifier() +func (*scheme) Oid() asn1.ObjectIdentifier { + return asn1.ObjectIdentifier{2, 16, 840, 1, 101, 3, 4, 18} +} + +func (*scheme) SupportsContext() bool { + return true +} + +func (*scheme) GenerateKey() (sign.PublicKey, sign.PrivateKey, error) { + return GenerateKey(nil) +} + +func (*scheme) Sign( + sk sign.PrivateKey, + msg []byte, + opts *sign.SignatureOpts, +) []byte { + var ctx []byte + sig := make([]byte, SignatureSize) + + priv, ok := sk.(*PrivateKey) + if !ok { + panic(sign.ErrTypeMismatch) + } + if opts != nil && opts.Context != "" { + ctx = []byte(opts.Context) + } + err := SignTo(priv, msg, ctx, false, sig) + if err != nil { + panic(err) + } + + return sig +} + +func (*scheme) Verify( + pk sign.PublicKey, + msg, sig []byte, + opts *sign.SignatureOpts, +) bool { + var ctx []byte + pub, ok := pk.(*PublicKey) + if !ok { + panic(sign.ErrTypeMismatch) + } + if opts != nil && opts.Context != "" { + ctx = []byte(opts.Context) + } + return Verify(pub, msg, ctx, sig) +} + +func (*scheme) DeriveKey(seed []byte) (sign.PublicKey, sign.PrivateKey) { + if len(seed) != SeedSize { + panic(sign.ErrSeedSize) + } + var seed2 [SeedSize]byte + copy(seed2[:], seed) + return NewKeyFromSeed(&seed2) +} + +func (*scheme) UnmarshalBinaryPublicKey(buf []byte) (sign.PublicKey, error) { + if len(buf) != PublicKeySize { + return nil, sign.ErrPubKeySize + } + + var ( + buf2 [PublicKeySize]byte + ret PublicKey + ) + + copy(buf2[:], buf) + ret.Unpack(&buf2) + return &ret, nil +} + +func (*scheme) UnmarshalBinaryPrivateKey(buf []byte) (sign.PrivateKey, error) { + if len(buf) != PrivateKeySize { + return nil, sign.ErrPrivKeySize + } + + var ( + buf2 [PrivateKeySize]byte + ret PrivateKey + ) + + copy(buf2[:], buf) + ret.Unpack(&buf2) + return &ret, nil +} + +func (sk *PrivateKey) Scheme() sign.Scheme { + return sch +} + +func (sk *PublicKey) Scheme() sign.Scheme { + return sch +} diff --git a/vendor/github.com/cloudflare/circl/sign/mldsa/mldsa65/internal/dilithium.go b/vendor/github.com/cloudflare/circl/sign/mldsa/mldsa65/internal/dilithium.go new file mode 100644 index 0000000000..8f1c8e5cbf --- /dev/null +++ b/vendor/github.com/cloudflare/circl/sign/mldsa/mldsa65/internal/dilithium.go @@ -0,0 +1,491 @@ +// Code generated from mode3/internal/dilithium.go by gen.go + +package internal + +import ( + cryptoRand "crypto/rand" + "crypto/subtle" + "io" + + "github.com/cloudflare/circl/internal/sha3" + common "github.com/cloudflare/circl/sign/internal/dilithium" +) + +const ( + // Size of a packed polynomial of norm ≤η. + // (Note that the formula is not valid in general.) + PolyLeqEtaSize = (common.N * DoubleEtaBits) / 8 + + // β = τη, the maximum size of c s₂. 
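+	// For ML-DSA-65: τ = 49 and η = 4, so β = 196.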
+ Beta = Tau * Eta + + // γ₁ range of y + Gamma1 = 1 << Gamma1Bits + + // Size of packed polynomial of norm <γ₁ such as z + PolyLeGamma1Size = (Gamma1Bits + 1) * common.N / 8 + + // α = 2γ₂ parameter for decompose + Alpha = 2 * Gamma2 + + // Size of a packed private key + PrivateKeySize = 32 + 32 + TRSize + PolyLeqEtaSize*(L+K) + common.PolyT0Size*K + + // Size of a packed public key + PublicKeySize = 32 + common.PolyT1Size*K + + // Size of a packed signature + SignatureSize = L*PolyLeGamma1Size + Omega + K + CTildeSize + + // Size of packed w₁ + PolyW1Size = (common.N * (common.QBits - Gamma1Bits)) / 8 +) + +// PublicKey is the type of Dilithium public keys. +type PublicKey struct { + rho [32]byte + t1 VecK + + // Cached values + t1p [common.PolyT1Size * K]byte + A *Mat + tr *[TRSize]byte +} + +// PrivateKey is the type of Dilithium private keys. +type PrivateKey struct { + rho [32]byte + key [32]byte + s1 VecL + s2 VecK + t0 VecK + tr [TRSize]byte + + // Cached values + A Mat // ExpandA(ρ) + s1h VecL // NTT(s₁) + s2h VecK // NTT(s₂) + t0h VecK // NTT(t₀) +} + +type unpackedSignature struct { + z VecL + hint VecK + c [CTildeSize]byte +} + +// Packs the signature into buf. +func (sig *unpackedSignature) Pack(buf []byte) { + copy(buf[:], sig.c[:]) + sig.z.PackLeGamma1(buf[CTildeSize:]) + sig.hint.PackHint(buf[CTildeSize+L*PolyLeGamma1Size:]) +} + +// Sets sig to the signature encoded in the buffer. +// +// Returns whether buf contains a properly packed signature. +func (sig *unpackedSignature) Unpack(buf []byte) bool { + if len(buf) < SignatureSize { + return false + } + copy(sig.c[:], buf[:]) + sig.z.UnpackLeGamma1(buf[CTildeSize:]) + if sig.z.Exceeds(Gamma1 - Beta) { + return false + } + if !sig.hint.UnpackHint(buf[CTildeSize+L*PolyLeGamma1Size:]) { + return false + } + return true +} + +// Packs the public key into buf. +func (pk *PublicKey) Pack(buf *[PublicKeySize]byte) { + copy(buf[:32], pk.rho[:]) + copy(buf[32:], pk.t1p[:]) +} + +// Sets pk to the public key encoded in buf. +func (pk *PublicKey) Unpack(buf *[PublicKeySize]byte) { + copy(pk.rho[:], buf[:32]) + copy(pk.t1p[:], buf[32:]) + + pk.t1.UnpackT1(pk.t1p[:]) + pk.A = new(Mat) + pk.A.Derive(&pk.rho) + + // tr = CRH(ρ ‖ t1) = CRH(pk) + pk.tr = new([TRSize]byte) + h := sha3.NewShake256() + _, _ = h.Write(buf[:]) + _, _ = h.Read(pk.tr[:]) +} + +// Packs the private key into buf. +func (sk *PrivateKey) Pack(buf *[PrivateKeySize]byte) { + copy(buf[:32], sk.rho[:]) + copy(buf[32:64], sk.key[:]) + copy(buf[64:64+TRSize], sk.tr[:]) + offset := 64 + TRSize + sk.s1.PackLeqEta(buf[offset:]) + offset += PolyLeqEtaSize * L + sk.s2.PackLeqEta(buf[offset:]) + offset += PolyLeqEtaSize * K + sk.t0.PackT0(buf[offset:]) +} + +// Sets sk to the private key encoded in buf. +func (sk *PrivateKey) Unpack(buf *[PrivateKeySize]byte) { + copy(sk.rho[:], buf[:32]) + copy(sk.key[:], buf[32:64]) + copy(sk.tr[:], buf[64:64+TRSize]) + offset := 64 + TRSize + sk.s1.UnpackLeqEta(buf[offset:]) + offset += PolyLeqEtaSize * L + sk.s2.UnpackLeqEta(buf[offset:]) + offset += PolyLeqEtaSize * K + sk.t0.UnpackT0(buf[offset:]) + + // Cached values + sk.A.Derive(&sk.rho) + sk.t0h = sk.t0 + sk.t0h.NTT() + sk.s1h = sk.s1 + sk.s1h.NTT() + sk.s2h = sk.s2 + sk.s2h.NTT() +} + +// GenerateKey generates a public/private key pair using entropy from rand. +// If rand is nil, crypto/rand.Reader will be used. 
+func GenerateKey(rand io.Reader) (*PublicKey, *PrivateKey, error) { + var seed [32]byte + if rand == nil { + rand = cryptoRand.Reader + } + _, err := io.ReadFull(rand, seed[:]) + if err != nil { + return nil, nil, err + } + pk, sk := NewKeyFromSeed(&seed) + return pk, sk, nil +} + +// NewKeyFromSeed derives a public/private key pair using the given seed. +func NewKeyFromSeed(seed *[common.SeedSize]byte) (*PublicKey, *PrivateKey) { + var eSeed [128]byte // expanded seed + var pk PublicKey + var sk PrivateKey + var sSeed [64]byte + + h := sha3.NewShake256() + _, _ = h.Write(seed[:]) + + if NIST { + _, _ = h.Write([]byte{byte(K), byte(L)}) + } + + _, _ = h.Read(eSeed[:]) + + copy(pk.rho[:], eSeed[:32]) + copy(sSeed[:], eSeed[32:96]) + copy(sk.key[:], eSeed[96:]) + copy(sk.rho[:], pk.rho[:]) + + sk.A.Derive(&pk.rho) + + for i := uint16(0); i < L; i++ { + PolyDeriveUniformLeqEta(&sk.s1[i], &sSeed, i) + } + + for i := uint16(0); i < K; i++ { + PolyDeriveUniformLeqEta(&sk.s2[i], &sSeed, i+L) + } + + sk.s1h = sk.s1 + sk.s1h.NTT() + sk.s2h = sk.s2 + sk.s2h.NTT() + + sk.computeT0andT1(&sk.t0, &pk.t1) + + sk.t0h = sk.t0 + sk.t0h.NTT() + + // Complete public key far enough to be packed + pk.t1.PackT1(pk.t1p[:]) + pk.A = &sk.A + + // Finish private key + var packedPk [PublicKeySize]byte + pk.Pack(&packedPk) + + // tr = CRH(ρ ‖ t1) = CRH(pk) + h.Reset() + _, _ = h.Write(packedPk[:]) + _, _ = h.Read(sk.tr[:]) + + // Finish cache of public key + pk.tr = &sk.tr + + return &pk, &sk +} + +// Computes t0 and t1 from sk.s1h, sk.s2 and sk.A. +func (sk *PrivateKey) computeT0andT1(t0, t1 *VecK) { + var t VecK + + // Set t to A s₁ + s₂ + for i := 0; i < K; i++ { + PolyDotHat(&t[i], &sk.A[i], &sk.s1h) + t[i].ReduceLe2Q() + t[i].InvNTT() + } + t.Add(&t, &sk.s2) + t.Normalize() + + // Compute t₀, t₁ = Power2Round(t) + t.Power2Round(t0, t1) +} + +// Verify checks whether the given signature by pk on msg is valid. +// +// For Dilithium this is the top-level verification function. +// In ML-DSA, this is ML-DSA.Verify_internal. +func Verify(pk *PublicKey, msg func(io.Writer), signature []byte) bool { + var sig unpackedSignature + var mu [64]byte + var zh VecL + var Az, Az2dct1, w1 VecK + var ch common.Poly + var cp [CTildeSize]byte + var w1Packed [PolyW1Size * K]byte + + // Note that Unpack() checked whether ‖z‖_∞ < γ₁ - β + // and ensured that there at most ω ones in pk.hint. + if !sig.Unpack(signature) { + return false + } + + // μ = CRH(tr ‖ msg) + h := sha3.NewShake256() + _, _ = h.Write(pk.tr[:]) + msg(&h) + _, _ = h.Read(mu[:]) + + // Compute Az + zh = sig.z + zh.NTT() + + for i := 0; i < K; i++ { + PolyDotHat(&Az[i], &pk.A[i], &zh) + } + + // Next, we compute Az - 2ᵈ·c·t₁. + // Note that the coefficients of t₁ are bounded by 256 = 2⁹, + // so the coefficients of Az2dct1 will bounded by 2⁹⁺ᵈ = 2²³ < 2q, + // which is small enough for NTT(). + Az2dct1.MulBy2toD(&pk.t1) + Az2dct1.NTT() + PolyDeriveUniformBall(&ch, sig.c[:]) + ch.NTT() + for i := 0; i < K; i++ { + Az2dct1[i].MulHat(&Az2dct1[i], &ch) + } + Az2dct1.Sub(&Az, &Az2dct1) + Az2dct1.ReduceLe2Q() + Az2dct1.InvNTT() + Az2dct1.NormalizeAssumingLe2Q() + + // UseHint(pk.hint, Az - 2ᵈ·c·t₁) + // = UseHint(pk.hint, w - c·s₂ + c·t₀) + // = UseHint(pk.hint, r + c·t₀) + // = r₁ = w₁. + w1.UseHint(&Az2dct1, &sig.hint) + w1.PackW1(w1Packed[:]) + + // c' = H(μ, w₁) + h.Reset() + _, _ = h.Write(mu[:]) + _, _ = h.Write(w1Packed[:]) + _, _ = h.Read(cp[:]) + + return sig.c == cp +} + +// SignTo signs the given message and writes the signature into signature. 
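+// The msg callback streams the message into the signing hash, so the
+// caller can prepend domain separation (as SignTo in the parent
+// package does).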
+// +// For Dilithium this is the top-level signing function. For ML-DSA +// this is ML-DSA.Sign_internal. +// +//nolint:funlen +func SignTo(sk *PrivateKey, msg func(io.Writer), rnd [32]byte, signature []byte) { + var mu, rhop [64]byte + var w1Packed [PolyW1Size * K]byte + var y, yh VecL + var w, w0, w1, w0mcs2, ct0, w0mcs2pct0 VecK + var ch common.Poly + var yNonce uint16 + var sig unpackedSignature + + if len(signature) < SignatureSize { + panic("Signature does not fit in that byteslice") + } + + // μ = CRH(tr ‖ msg) + h := sha3.NewShake256() + _, _ = h.Write(sk.tr[:]) + msg(&h) + _, _ = h.Read(mu[:]) + + // ρ' = CRH(key ‖ μ) + h.Reset() + _, _ = h.Write(sk.key[:]) + if NIST { + _, _ = h.Write(rnd[:]) + } + _, _ = h.Write(mu[:]) + _, _ = h.Read(rhop[:]) + + // Main rejection loop + attempt := 0 + for { + attempt++ + if attempt >= 576 { + // Depending on the mode, one try has a chance between 1/7 and 1/4 + // of succeeding. Thus it is safe to say that 576 iterations + // are enough as (6/7)⁵⁷⁶ < 2⁻¹²⁸. + panic("This should only happen 1 in 2^{128}: something is wrong.") + } + + // y = ExpandMask(ρ', key) + VecLDeriveUniformLeGamma1(&y, &rhop, yNonce) + yNonce += uint16(L) + + // Set w to A y + yh = y + yh.NTT() + for i := 0; i < K; i++ { + PolyDotHat(&w[i], &sk.A[i], &yh) + w[i].ReduceLe2Q() + w[i].InvNTT() + } + + // Decompose w into w₀ and w₁ + w.NormalizeAssumingLe2Q() + w.Decompose(&w0, &w1) + + // c~ = H(μ ‖ w₁) + w1.PackW1(w1Packed[:]) + h.Reset() + _, _ = h.Write(mu[:]) + _, _ = h.Write(w1Packed[:]) + _, _ = h.Read(sig.c[:]) + + PolyDeriveUniformBall(&ch, sig.c[:]) + ch.NTT() + + // Ensure ‖ w₀ - c·s2 ‖_∞ < γ₂ - β. + // + // By Lemma 3 of the specification this is equivalent to checking that + // both ‖ r₀ ‖_∞ < γ₂ - β and r₁ = w₁, for the decomposition + // w - c·s₂ = r₁ α + r₀ as computed by decompose(). + // See also §4.1 of the specification. + for i := 0; i < K; i++ { + w0mcs2[i].MulHat(&ch, &sk.s2h[i]) + w0mcs2[i].InvNTT() + } + w0mcs2.Sub(&w0, &w0mcs2) + w0mcs2.Normalize() + + if w0mcs2.Exceeds(Gamma2 - Beta) { + continue + } + + // z = y + c·s₁ + for i := 0; i < L; i++ { + sig.z[i].MulHat(&ch, &sk.s1h[i]) + sig.z[i].InvNTT() + } + sig.z.Add(&sig.z, &y) + sig.z.Normalize() + + // Ensure ‖z‖_∞ < γ₁ - β + if sig.z.Exceeds(Gamma1 - Beta) { + continue + } + + // Compute c·t₀ + for i := 0; i < K; i++ { + ct0[i].MulHat(&ch, &sk.t0h[i]) + ct0[i].InvNTT() + } + ct0.NormalizeAssumingLe2Q() + + // Ensure ‖c·t₀‖_∞ < γ₂. + if ct0.Exceeds(Gamma2) { + continue + } + + // Create the hint to be able to reconstruct w₁ from w - c·s₂ + c·t0. + // Note that we're not using makeHint() in the obvious way as we + // do not know whether ‖ sc·s₂ - c·t₀ ‖_∞ < γ₂. Instead we note + // that our makeHint() is actually the same as a makeHint for a + // different decomposition: + // + // Earlier we ensured indirectly with a check that r₁ = w₁ where + // r = w - c·s₂. Hence r₀ = r - r₁ α = w - c·s₂ - w₁ α = w₀ - c·s₂. + // Thus MakeHint(w₀ - c·s₂ + c·t₀, w₁) = MakeHint(r0 + c·t₀, r₁) + // and UseHint(w - c·s₂ + c·t₀, w₁) = UseHint(r + c·t₀, r₁). + // As we just ensured that ‖ c·t₀ ‖_∞ < γ₂ our usage is correct. + w0mcs2pct0.Add(&w0mcs2, &ct0) + w0mcs2pct0.NormalizeAssumingLe2Q() + hintPop := sig.hint.MakeHint(&w0mcs2pct0, &w1) + if hintPop > Omega { + continue + } + + break + } + + sig.Pack(signature[:]) +} + +// Computes the public key corresponding to this private key. 
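+// The returned key shares ρ, A and tr with sk; t1 is recomputed via
+// computeT0andT1.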
+func (sk *PrivateKey) Public() *PublicKey { + var t0 VecK + pk := &PublicKey{ + rho: sk.rho, + A: &sk.A, + tr: &sk.tr, + } + sk.computeT0andT1(&t0, &pk.t1) + pk.t1.PackT1(pk.t1p[:]) + return pk +} + +// Equal returns whether the two public keys are equal +func (pk *PublicKey) Equal(other *PublicKey) bool { + return pk.rho == other.rho && pk.t1 == other.t1 +} + +// Equal returns whether the two private keys are equal +func (sk *PrivateKey) Equal(other *PrivateKey) bool { + ret := (subtle.ConstantTimeCompare(sk.rho[:], other.rho[:]) & + subtle.ConstantTimeCompare(sk.key[:], other.key[:]) & + subtle.ConstantTimeCompare(sk.tr[:], other.tr[:])) + + acc := uint32(0) + for i := 0; i < L; i++ { + for j := 0; j < common.N; j++ { + acc |= sk.s1[i][j] ^ other.s1[i][j] + } + } + for i := 0; i < K; i++ { + for j := 0; j < common.N; j++ { + acc |= sk.s2[i][j] ^ other.s2[i][j] + acc |= sk.t0[i][j] ^ other.t0[i][j] + } + } + return (ret & subtle.ConstantTimeEq(int32(acc), 0)) == 1 +} diff --git a/vendor/github.com/cloudflare/circl/sign/mldsa/mldsa65/internal/mat.go b/vendor/github.com/cloudflare/circl/sign/mldsa/mldsa65/internal/mat.go new file mode 100644 index 0000000000..ceaf634fa7 --- /dev/null +++ b/vendor/github.com/cloudflare/circl/sign/mldsa/mldsa65/internal/mat.go @@ -0,0 +1,59 @@ +// Code generated from mode3/internal/mat.go by gen.go + +package internal + +import ( + common "github.com/cloudflare/circl/sign/internal/dilithium" +) + +// A k by l matrix of polynomials. +type Mat [K]VecL + +// Expands the given seed to a complete matrix. +// +// This function is called ExpandA in the specification. +func (m *Mat) Derive(seed *[32]byte) { + if !DeriveX4Available { + for i := uint16(0); i < K; i++ { + for j := uint16(0); j < L; j++ { + PolyDeriveUniform(&m[i][j], seed, (i<<8)+j) + } + } + return + } + + idx := 0 + var nonces [4]uint16 + var ps [4]*common.Poly + for i := uint16(0); i < K; i++ { + for j := uint16(0); j < L; j++ { + nonces[idx] = (i << 8) + j + ps[idx] = &m[i][j] + idx++ + if idx == 4 { + idx = 0 + PolyDeriveUniformX4(ps, seed, nonces) + } + } + } + if idx != 0 { + for i := idx; i < 4; i++ { + ps[i] = nil + } + PolyDeriveUniformX4(ps, seed, nonces) + } +} + +// Set p to the inner product of a and b using pointwise multiplication. +// +// Assumes a and b are in Montgomery form and their coefficients are +// pairwise sufficiently small to multiply, see Poly.MulHat(). Resulting +// coefficients are bounded by 2Lq. +func PolyDotHat(p *common.Poly, a, b *VecL) { + var t common.Poly + *p = common.Poly{} // zero p + for i := 0; i < L; i++ { + t.MulHat(&a[i], &b[i]) + p.Add(&t, p) + } +} diff --git a/vendor/github.com/cloudflare/circl/sign/mldsa/mldsa65/internal/pack.go b/vendor/github.com/cloudflare/circl/sign/mldsa/mldsa65/internal/pack.go new file mode 100644 index 0000000000..1854b41973 --- /dev/null +++ b/vendor/github.com/cloudflare/circl/sign/mldsa/mldsa65/internal/pack.go @@ -0,0 +1,270 @@ +// Code generated from mode3/internal/pack.go by gen.go + +package internal + +import ( + common "github.com/cloudflare/circl/sign/internal/dilithium" +) + +// Writes p with norm less than or equal η into buf, which must be of +// size PolyLeqEtaSize. +// +// Assumes coefficients of p are not normalized, but in [q-η,q+η]. 
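+// With η = 4 (DoubleEtaBits == 4) each value Q+η−p[j] lies in [0, 2η],
+// so it fits in four bits and two coefficients pack into each byte.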
+func PolyPackLeqEta(p *common.Poly, buf []byte) {
+	if DoubleEtaBits == 4 { // compiler eliminates branch
+		j := 0
+		for i := 0; i < PolyLeqEtaSize; i++ {
+			buf[i] = (byte(common.Q+Eta-p[j]) |
+				byte(common.Q+Eta-p[j+1])<<4)
+			j += 2
+		}
+	} else if DoubleEtaBits == 3 {
+		j := 0
+		for i := 0; i < PolyLeqEtaSize; i += 3 {
+			buf[i] = (byte(common.Q+Eta-p[j]) |
+				(byte(common.Q+Eta-p[j+1]) << 3) |
+				(byte(common.Q+Eta-p[j+2]) << 6))
+			buf[i+1] = ((byte(common.Q+Eta-p[j+2]) >> 2) |
+				(byte(common.Q+Eta-p[j+3]) << 1) |
+				(byte(common.Q+Eta-p[j+4]) << 4) |
+				(byte(common.Q+Eta-p[j+5]) << 7))
+			buf[i+2] = ((byte(common.Q+Eta-p[j+5]) >> 1) |
+				(byte(common.Q+Eta-p[j+6]) << 2) |
+				(byte(common.Q+Eta-p[j+7]) << 5))
+			j += 8
+		}
+	} else {
+		panic("eta not supported")
+	}
+}
+
+// Sets p to the polynomial of norm less than or equal η encoded in the
+// given buffer of size PolyLeqEtaSize.
+//
+// Output coefficients of p are not normalized, but in [q-η,q+η] provided
+// buf was created using PackLeqEta.
+//
+// Beware, for arbitrary buf the coefficients of p might end up in
+// the interval [q-2^b,q+2^b] where b is the least b with η≤2^b.
+func PolyUnpackLeqEta(p *common.Poly, buf []byte) {
+	if DoubleEtaBits == 4 { // compiler eliminates branch
+		j := 0
+		for i := 0; i < PolyLeqEtaSize; i++ {
+			p[j] = common.Q + Eta - uint32(buf[i]&15)
+			p[j+1] = common.Q + Eta - uint32(buf[i]>>4)
+			j += 2
+		}
+	} else if DoubleEtaBits == 3 {
+		j := 0
+		for i := 0; i < PolyLeqEtaSize; i += 3 {
+			p[j] = common.Q + Eta - uint32(buf[i]&7)
+			p[j+1] = common.Q + Eta - uint32((buf[i]>>3)&7)
+			p[j+2] = common.Q + Eta - uint32((buf[i]>>6)|((buf[i+1]<<2)&7))
+			p[j+3] = common.Q + Eta - uint32((buf[i+1]>>1)&7)
+			p[j+4] = common.Q + Eta - uint32((buf[i+1]>>4)&7)
+			p[j+5] = common.Q + Eta - uint32((buf[i+1]>>7)|((buf[i+2]<<1)&7))
+			p[j+6] = common.Q + Eta - uint32((buf[i+2]>>2)&7)
+			p[j+7] = common.Q + Eta - uint32((buf[i+2]>>5)&7)
+			j += 8
+		}
+	} else {
+		panic("eta not supported")
+	}
+}
+
+// Writes v with coefficients in {0, 1} of which at most ω non-zero
+// to buf, which must have length ω+k.
+func (v *VecK) PackHint(buf []byte) {
+	// The packed hint starts with the indices of the non-zero coefficients
+	// For instance:
+	//
+	//	(x⁵⁶ + x¹⁰⁰, x²⁵⁵, 0, x² + x²³, x¹)
+	//
+	// Yields
+	//
+	//	56, 100, 255, 2, 23, 1
+	//
+	// Then we pad with zeroes until we have a list of ω items:
+	//
+	//	56, 100, 255, 2, 23, 1, 0, 0, ..., 0
+	//
+	// Then we finish with a list of the switch-over-indices in this
+	// list between polynomials, so:
+	//
+	//	56, 100, 255, 2, 23, 1, 0, 0, ..., 0, 2, 3, 3, 5, 6
+
+	off := uint8(0)
+	for i := 0; i < K; i++ {
+		for j := uint16(0); j < common.N; j++ {
+			if v[i][j] != 0 {
+				buf[off] = uint8(j)
+				off++
+			}
+		}
+		buf[Omega+i] = off
+	}
+	for ; off < Omega; off++ {
+		buf[off] = 0
+	}
+}
+
+// Sets v to the vector encoded using VecK.PackHint()
+//
+// Returns whether unpacking was successful.
+func (v *VecK) UnpackHint(buf []byte) bool {
+	// A priori, there would be several reasonable ways to encode the same
+	// hint vector. We take care to allow only one encoding, to ensure
+	// "strong unforgeability".
+	//
+	// See PackHint() source for description of the encoding.
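+	// Concretely this means: the switch-over points must be non-decreasing
+	// and at most ω, the indices within each polynomial strictly increasing,
+	// and every padding byte zero; any other byte stream is rejected.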
+ *v = VecK{} // zero v + prevSOP := uint8(0) // previous switch-over-point + for i := 0; i < K; i++ { + SOP := buf[Omega+i] + if SOP < prevSOP || SOP > Omega { + return false // ensures switch-over-points are increasing + } + for j := prevSOP; j < SOP; j++ { + if j > prevSOP && buf[j] <= buf[j-1] { + return false // ensures indices are increasing (within a poly) + } + v[i][buf[j]] = 1 + } + prevSOP = SOP + } + for j := prevSOP; j < Omega; j++ { + if buf[j] != 0 { + return false // ensures padding indices are zero + } + } + + return true +} + +// Sets p to the polynomial packed into buf by PolyPackLeGamma1. +// +// p will be normalized. +func PolyUnpackLeGamma1(p *common.Poly, buf []byte) { + if Gamma1Bits == 17 { + j := 0 + for i := 0; i < PolyLeGamma1Size; i += 9 { + p0 := uint32(buf[i]) | (uint32(buf[i+1]) << 8) | + (uint32(buf[i+2]&0x3) << 16) + p1 := uint32(buf[i+2]>>2) | (uint32(buf[i+3]) << 6) | + (uint32(buf[i+4]&0xf) << 14) + p2 := uint32(buf[i+4]>>4) | (uint32(buf[i+5]) << 4) | + (uint32(buf[i+6]&0x3f) << 12) + p3 := uint32(buf[i+6]>>6) | (uint32(buf[i+7]) << 2) | + (uint32(buf[i+8]) << 10) + + // coefficients in [0,…,2γ₁) + p0 = Gamma1 - p0 // (-γ₁,…,γ₁] + p1 = Gamma1 - p1 + p2 = Gamma1 - p2 + p3 = Gamma1 - p3 + + p0 += uint32(int32(p0)>>31) & common.Q // normalize + p1 += uint32(int32(p1)>>31) & common.Q + p2 += uint32(int32(p2)>>31) & common.Q + p3 += uint32(int32(p3)>>31) & common.Q + + p[j] = p0 + p[j+1] = p1 + p[j+2] = p2 + p[j+3] = p3 + + j += 4 + } + } else if Gamma1Bits == 19 { + j := 0 + for i := 0; i < PolyLeGamma1Size; i += 5 { + p0 := uint32(buf[i]) | (uint32(buf[i+1]) << 8) | + (uint32(buf[i+2]&0xf) << 16) + p1 := uint32(buf[i+2]>>4) | (uint32(buf[i+3]) << 4) | + (uint32(buf[i+4]) << 12) + + p0 = Gamma1 - p0 + p1 = Gamma1 - p1 + + p0 += uint32(int32(p0)>>31) & common.Q + p1 += uint32(int32(p1)>>31) & common.Q + + p[j] = p0 + p[j+1] = p1 + + j += 2 + } + } else { + panic("γ₁ not supported") + } +} + +// Writes p whose coefficients are in (-γ₁,γ₁] into buf +// which has to be of length PolyLeGamma1Size. +// +// Assumes p is normalized. +func PolyPackLeGamma1(p *common.Poly, buf []byte) { + if Gamma1Bits == 17 { + j := 0 + // coefficients in [0,…,γ₁] ∪ (q-γ₁,…,q) + for i := 0; i < PolyLeGamma1Size; i += 9 { + p0 := Gamma1 - p[j] // [0,…,γ₁] ∪ (γ₁-q,…,2γ₁-q) + p0 += uint32(int32(p0)>>31) & common.Q // [0,…,2γ₁) + p1 := Gamma1 - p[j+1] + p1 += uint32(int32(p1)>>31) & common.Q + p2 := Gamma1 - p[j+2] + p2 += uint32(int32(p2)>>31) & common.Q + p3 := Gamma1 - p[j+3] + p3 += uint32(int32(p3)>>31) & common.Q + + buf[i+0] = byte(p0) + buf[i+1] = byte(p0 >> 8) + buf[i+2] = byte(p0>>16) | byte(p1<<2) + buf[i+3] = byte(p1 >> 6) + buf[i+4] = byte(p1>>14) | byte(p2<<4) + buf[i+5] = byte(p2 >> 4) + buf[i+6] = byte(p2>>12) | byte(p3<<6) + buf[i+7] = byte(p3 >> 2) + buf[i+8] = byte(p3 >> 10) + + j += 4 + } + } else if Gamma1Bits == 19 { + j := 0 + for i := 0; i < PolyLeGamma1Size; i += 5 { + // Coefficients are in [0, γ₁] ∪ (Q-γ₁, Q) + p0 := Gamma1 - p[j] + p0 += uint32(int32(p0)>>31) & common.Q + p1 := Gamma1 - p[j+1] + p1 += uint32(int32(p1)>>31) & common.Q + + buf[i+0] = byte(p0) + buf[i+1] = byte(p0 >> 8) + buf[i+2] = byte(p0>>16) | byte(p1<<4) + buf[i+3] = byte(p1 >> 4) + buf[i+4] = byte(p1 >> 12) + + j += 2 + } + } else { + panic("γ₁ not supported") + } +} + +// Pack w₁ into buf, which must be of length PolyW1Size. +// +// Assumes w₁ is normalized. 
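+//
+// With γ₂ = (q-1)/32 (Gamma1Bits == 19) the coefficients of w₁ lie in
+// [0,15] and take 4 bits each; with γ₂ = (q-1)/88 (Gamma1Bits == 17) they
+// lie in [0,43] and take 6 bits each.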
+func PolyPackW1(p *common.Poly, buf []byte) {
+	if Gamma1Bits == 19 {
+		p.PackLe16(buf)
+	} else if Gamma1Bits == 17 {
+		j := 0
+		for i := 0; i < PolyW1Size; i += 3 {
+			buf[i] = byte(p[j]) | byte(p[j+1]<<6)
+			buf[i+1] = byte(p[j+1]>>2) | byte(p[j+2]<<4)
+			buf[i+2] = byte(p[j+2]>>4) | byte(p[j+3]<<2)
+			j += 4
+		}
+	} else {
+		panic("unsupported γ₁")
+	}
+}
diff --git a/vendor/github.com/cloudflare/circl/sign/mldsa/mldsa65/internal/params.go b/vendor/github.com/cloudflare/circl/sign/mldsa/mldsa65/internal/params.go
new file mode 100644
index 0000000000..8a1f866e65
--- /dev/null
+++ b/vendor/github.com/cloudflare/circl/sign/mldsa/mldsa65/internal/params.go
@@ -0,0 +1,18 @@
+// Code generated from params.templ.go. DO NOT EDIT.
+
+package internal
+
+const (
+	Name          = "ML-DSA-65"
+	K             = 6
+	L             = 5
+	Eta           = 4
+	DoubleEtaBits = 4
+	Omega         = 55
+	Tau           = 49
+	Gamma1Bits    = 19
+	Gamma2        = 261888
+	NIST          = true
+	TRSize        = 64
+	CTildeSize    = 48
+)
diff --git a/vendor/github.com/cloudflare/circl/sign/mldsa/mldsa65/internal/rounding.go b/vendor/github.com/cloudflare/circl/sign/mldsa/mldsa65/internal/rounding.go
new file mode 100644
index 0000000000..58123c090b
--- /dev/null
+++ b/vendor/github.com/cloudflare/circl/sign/mldsa/mldsa65/internal/rounding.go
@@ -0,0 +1,142 @@
+// Code generated from mode3/internal/rounding.go by gen.go
+
+package internal
+
+import (
+	common "github.com/cloudflare/circl/sign/internal/dilithium"
+)
+
+// Splits 0 ≤ a < q into a₀ and a₁ with a = a₁*α + a₀ with -α/2 < a₀ ≤ α/2,
+// except for when we would have a₁ = (q-1)/α in which case a₁=0 is taken
+// and -α/2 ≤ a₀ < 0. Returns a₀ + q. Note 0 ≤ a₁ < (q-1)/α.
+// Recall α = 2γ₂.
+func decompose(a uint32) (a0plusQ, a1 uint32) {
+	// a₁ = ⌈a / 128⌉
+	a1 = (a + 127) >> 7
+
+	if Alpha == 523776 {
+		// 1025/2²² is close enough to 1/4092 so that a₁
+		// becomes a/α rounded down.
+		a1 = ((a1*1025 + (1 << 21)) >> 22)
+
+		// For the corner-case a₁ = (q-1)/α = 16, we have to set a₁=0.
+		a1 &= 15
+	} else if Alpha == 190464 {
+		// 11275/2²⁴ is close enough to 1/1488 so that a₁
+		// becomes a/α rounded down.
+		a1 = ((a1 * 11275) + (1 << 23)) >> 24
+
+		// For the corner-case a₁ = (q-1)/α = 44, we have to set a₁=0.
+		a1 ^= uint32(int32(43-a1)>>31) & a1
+	} else {
+		panic("unsupported α")
+	}
+
+	a0plusQ = a - a1*Alpha
+
+	// In the corner-case, when we set a₁=0, we will incorrectly
+	// have a₀ > (q-1)/2 and we'll need to subtract q. As we
+	// return a₀ + q, that comes down to adding q if a₀ < (q-1)/2.
+	a0plusQ += uint32(int32(a0plusQ-(common.Q-1)/2)>>31) & common.Q
+
+	return
+}
+
+// Assume 0 ≤ r, f < Q with ‖f‖_∞ ≤ α/2. Decompose r as r = r1*α + r0 as
+// computed by decompose(). Write r' := r - f (mod Q). Now, decompose
+// r'=r-f again as r' = r'1*α + r'0 using decompose(). As f is small, we
+// have r'1 = r1 + h, where h ∈ {-1, 0, 1}. makeHint() computes |h|
+// given z0 := r0 - f (mod Q) and r1. With |h|, which is called the hint,
+// we can reconstruct r1 using only r' = r - f, which is done by useHint().
+// To wit:
+//
+//	useHint( r - f, makeHint( r0 - f, r1 ) ) = r1.
+//
+// Assumes 0 ≤ z0 < Q.
+func makeHint(z0, r1 uint32) uint32 {
+	// If -α/2 < r0 - f ≤ α/2, then r1*α + r0 - f is a valid decomposition of r'
+	// with the restrictions of decompose() and so r'1 = r1. So the hint
+	// should be 0. This is covered by the first two inequalities.
+	// There is one other case: if r0 - f = -α/2, then r1*α + r0 - f is also
+	// a valid decomposition if r1 = 0. In the other cases a one is carried
+	// and the hint should be 1.
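+	// Below, z0 ≤ γ₂ covers 0 ≤ r0-f ≤ γ₂ and z0 > q-γ₂ covers
+	// -γ₂ < r0-f < 0; together that is exactly -α/2 < r0-f ≤ α/2 for α = 2γ₂,
+	// and the corner case z0 = q-γ₂ with r1 = 0 handles r0-f = -α/2.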
+ if z0 <= Gamma2 || z0 > common.Q-Gamma2 || (z0 == common.Q-Gamma2 && r1 == 0) { + return 0 + } + return 1 +} + +// Uses the hint created by makeHint() to reconstruct r1 from r'=r-f; see +// documentation of makeHint() for context. +// Assumes 0 ≤ r' < Q. +func useHint(rp uint32, hint uint32) uint32 { + rp0plusQ, rp1 := decompose(rp) + if hint == 0 { + return rp1 + } + if rp0plusQ > common.Q { + return (rp1 + 1) & 15 + } + return (rp1 - 1) & 15 +} + +// Sets p to the hint polynomial for p0 the modified low bits and p1 +// the unmodified high bits --- see makeHint(). +// +// Returns the number of ones in the hint polynomial. +func PolyMakeHint(p, p0, p1 *common.Poly) (pop uint32) { + for i := 0; i < common.N; i++ { + h := makeHint(p0[i], p1[i]) + pop += h + p[i] = h + } + return +} + +// Computes corrections to the high bits of the polynomial q according +// to the hints in h and sets p to the corrected high bits. Returns p. +func PolyUseHint(p, q, hint *common.Poly) { + var q0PlusQ common.Poly + + // See useHint() and makeHint() for an explanation. We reimplement it + // here so that we can call Poly.Decompose(), which might be way faster + // than calling decompose() in a loop (for instance when having AVX2.) + + PolyDecompose(q, &q0PlusQ, p) + + for i := 0; i < common.N; i++ { + if hint[i] == 0 { + continue + } + if Gamma2 == 261888 { + if q0PlusQ[i] > common.Q { + p[i] = (p[i] + 1) & 15 + } else { + p[i] = (p[i] - 1) & 15 + } + } else if Gamma2 == 95232 { + if q0PlusQ[i] > common.Q { + if p[i] == 43 { + p[i] = 0 + } else { + p[i]++ + } + } else { + if p[i] == 0 { + p[i] = 43 + } else { + p[i]-- + } + } + } else { + panic("unsupported γ₂") + } + } +} + +// Splits each of the coefficients of p using decompose. +func PolyDecompose(p, p0PlusQ, p1 *common.Poly) { + for i := 0; i < common.N; i++ { + p0PlusQ[i], p1[i] = decompose(p[i]) + } +} diff --git a/vendor/github.com/cloudflare/circl/sign/mldsa/mldsa65/internal/sample.go b/vendor/github.com/cloudflare/circl/sign/mldsa/mldsa65/internal/sample.go new file mode 100644 index 0000000000..b37370a4ec --- /dev/null +++ b/vendor/github.com/cloudflare/circl/sign/mldsa/mldsa65/internal/sample.go @@ -0,0 +1,339 @@ +// Code generated from mode3/internal/sample.go by gen.go + +package internal + +import ( + "encoding/binary" + + "github.com/cloudflare/circl/internal/sha3" + common "github.com/cloudflare/circl/sign/internal/dilithium" + "github.com/cloudflare/circl/simd/keccakf1600" +) + +// DeriveX4Available indicates whether the system supports the quick fourway +// sampling variants like PolyDeriveUniformX4. +var DeriveX4Available = keccakf1600.IsEnabledX4() + +// For each i, sample ps[i] uniformly from the given seed and nonces[i]. +// ps[i] may be nil and is ignored in that case. +// +// Can only be called when DeriveX4Available is true. +func PolyDeriveUniformX4(ps [4]*common.Poly, seed *[32]byte, nonces [4]uint16) { + var perm keccakf1600.StateX4 + state := perm.Initialize(false) + + // Absorb the seed in the four states + for i := 0; i < 4; i++ { + v := binary.LittleEndian.Uint64(seed[8*i : 8*(i+1)]) + for j := 0; j < 4; j++ { + state[i*4+j] = v + } + } + + // Absorb the nonces, the SHAKE128 domain separator (0b1111), the + // start of the padding (0b...001) and the end of the padding 0b100... + // Recall that the rate of SHAKE128 is 168 --- i.e. 21 uint64s. 
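+	// The 32-byte seed fills words 0–3, so the two nonce bytes and the 0x1f
+	// separator land in word 4 (byte offsets 32–34), and the closing 0x80 of
+	// the padding goes into the last byte of word 20, byte 167 of the rate.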
+	for j := 0; j < 4; j++ {
+		state[4*4+j] = uint64(nonces[j]) | (0x1f << 16)
+		state[20*4+j] = 0x80 << 56
+	}
+
+	var idx [4]int // indices into ps
+	for j := 0; j < 4; j++ {
+		if ps[j] == nil {
+			idx[j] = common.N // mark nil polynomial as completed
+		}
+	}
+
+	done := false
+	for !done {
+		// Applies Keccak-f[1600] to state to get the next 21 uint64s of each
+		// of the four SHAKE128 streams.
+		perm.Permute()
+
+		done = true
+
+	PolyLoop:
+		for j := 0; j < 4; j++ {
+			if idx[j] == common.N {
+				continue
+			}
+			for i := 0; i < 7; i++ {
+				var t [8]uint32
+				t[0] = uint32(state[i*3*4+j] & 0x7fffff)
+				t[1] = uint32((state[i*3*4+j] >> 24) & 0x7fffff)
+				t[2] = uint32((state[i*3*4+j] >> 48) |
+					((state[(i*3+1)*4+j] & 0x7f) << 16))
+				t[3] = uint32((state[(i*3+1)*4+j] >> 8) & 0x7fffff)
+				t[4] = uint32((state[(i*3+1)*4+j] >> 32) & 0x7fffff)
+				t[5] = uint32((state[(i*3+1)*4+j] >> 56) |
+					((state[(i*3+2)*4+j] & 0x7fff) << 8))
+				t[6] = uint32((state[(i*3+2)*4+j] >> 16) & 0x7fffff)
+				t[7] = uint32((state[(i*3+2)*4+j] >> 40) & 0x7fffff)
+
+				for k := 0; k < 8; k++ {
+					if t[k] < common.Q {
+						ps[j][idx[j]] = t[k]
+						idx[j]++
+						if idx[j] == common.N {
+							continue PolyLoop
+						}
+					}
+				}
+			}
+			done = false
+		}
+	}
+}
+
+// Sample p uniformly from the given seed and nonce.
+//
+// p will be normalized.
+func PolyDeriveUniform(p *common.Poly, seed *[32]byte, nonce uint16) {
+	var i, length int
+	var buf [12 * 16]byte // fits 168B SHAKE-128 rate
+
+	length = 168
+
+	sample := func() {
+		// Note that 3 divides into 168 and 12*16, so we use up buf completely.
+		for j := 0; j < length && i < common.N; j += 3 {
+			t := (uint32(buf[j]) | (uint32(buf[j+1]) << 8) |
+				(uint32(buf[j+2]) << 16)) & 0x7fffff
+
+			// We use rejection sampling
+			if t < common.Q {
+				p[i] = t
+				i++
+			}
+		}
+	}
+
+	var iv [32 + 2]byte // 32 byte seed + uint16 nonce
+	h := sha3.NewShake128()
+	copy(iv[:32], seed[:])
+	iv[32] = uint8(nonce)
+	iv[33] = uint8(nonce >> 8)
+	_, _ = h.Write(iv[:])
+
+	for i < common.N {
+		_, _ = h.Read(buf[:168])
+		sample()
+	}
+}
+
+// Sample p uniformly with coefficients of norm less than or equal η,
+// using the given seed and nonce.
+//
+// p will not be normalized, but will have coefficients in [q-η,q+η].
+func PolyDeriveUniformLeqEta(p *common.Poly, seed *[64]byte, nonce uint16) {
+	// Assumes 2 ≤ η < 8.
+	var i, length int
+	var buf [9 * 16]byte // fits 136B SHAKE-256 rate
+
+	length = 136
+
+	sample := func() {
+		// We use rejection sampling
+		for j := 0; j < length && i < common.N; j++ {
+			t1 := uint32(buf[j]) & 15
+			t2 := uint32(buf[j]) >> 4
+			if Eta == 2 { // branch is eliminated by compiler
+				if t1 <= 14 {
+					t1 -= ((205 * t1) >> 10) * 5 // reduce mod 5
+					p[i] = common.Q + Eta - t1
+					i++
+				}
+				if t2 <= 14 && i < common.N {
+					t2 -= ((205 * t2) >> 10) * 5 // reduce mod 5
+					p[i] = common.Q + Eta - t2
+					i++
+				}
+			} else if Eta == 4 {
+				if t1 <= 2*Eta {
+					p[i] = common.Q + Eta - t1
+					i++
+				}
+				if t2 <= 2*Eta && i < common.N {
+					p[i] = common.Q + Eta - t2
+					i++
+				}
+			} else {
+				panic("unsupported η")
+			}
+		}
+	}
+
+	var iv [64 + 2]byte // 64 byte seed + uint16 nonce
+
+	h := sha3.NewShake256()
+	copy(iv[:64], seed[:])
+	iv[64] = uint8(nonce)
+	iv[65] = uint8(nonce >> 8)
+
+	// 136 is SHAKE-256 rate
+	_, _ = h.Write(iv[:])
+
+	for i < common.N {
+		_, _ = h.Read(buf[:136])
+		sample()
+	}
+}
+
+// Sample v[i] uniformly with coefficients in (-γ₁,…,γ₁] using the
+// given seed and nonce+i.
+//
+// p will be normalized.
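+//
+// The nonces consumed are nonce, nonce+1, …, nonce+L-1, which is why
+// SignTo advances yNonce by L after each call.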
+func VecLDeriveUniformLeGamma1(v *VecL, seed *[64]byte, nonce uint16) {
+	for i := 0; i < L; i++ {
+		PolyDeriveUniformLeGamma1(&v[i], seed, nonce+uint16(i))
+	}
+}
+
+// Sample p uniformly with coefficients in (-γ₁,…,γ₁] using the
+// given seed and nonce.
+//
+// p will be normalized.
+func PolyDeriveUniformLeGamma1(p *common.Poly, seed *[64]byte, nonce uint16) {
+	var buf [PolyLeGamma1Size]byte
+
+	var iv [66]byte
+	h := sha3.NewShake256()
+	copy(iv[:64], seed[:])
+	iv[64] = uint8(nonce)
+	iv[65] = uint8(nonce >> 8)
+	_, _ = h.Write(iv[:])
+	_, _ = h.Read(buf[:])
+
+	PolyUnpackLeGamma1(p, buf[:])
+}
+
+// For each i, sample ps[i] uniformly with τ non-zero coefficients in {q-1,1}
+// using the given seed. ps[i] may be nil and is ignored
+// in that case. ps[i] will be normalized.
+//
+// Can only be called when DeriveX4Available is true.
+//
+// This function is not used yet.
+func PolyDeriveUniformBallX4(ps [4]*common.Poly, seed []byte) {
+	var perm keccakf1600.StateX4
+	state := perm.Initialize(false)
+
+	// Absorb the seed in the four states
+	for i := 0; i < CTildeSize/8; i++ {
+		v := binary.LittleEndian.Uint64(seed[8*i : 8*(i+1)])
+		for j := 0; j < 4; j++ {
+			state[i*4+j] = v
+		}
+	}
+
+	// SHAKE256 domain separator and padding
+	for j := 0; j < 4; j++ {
+		state[(CTildeSize/8)*4+j] ^= 0x1f
+		state[16*4+j] ^= 0x80 << 56
+	}
+	perm.Permute()
+
+	var signs [4]uint64
+	var idx [4]uint16 // indices into ps
+
+	for j := 0; j < 4; j++ {
+		if ps[j] != nil {
+			signs[j] = state[j]
+			*ps[j] = common.Poly{} // zero ps[j]
+			idx[j] = common.N - Tau
+		} else {
+			idx[j] = common.N // mark as completed
+		}
+	}
+
+	stateOffset := 1
+	for {
+		done := true
+
+	PolyLoop:
+		for j := 0; j < 4; j++ {
+			if idx[j] == common.N {
+				continue
+			}
+
+			for i := stateOffset; i < 17; i++ {
+				var bs [8]byte
+				binary.LittleEndian.PutUint64(bs[:], state[4*i+j])
+				for k := 0; k < 8; k++ {
+					b := uint16(bs[k])
+
+					if b > idx[j] {
+						continue
+					}
+
+					ps[j][idx[j]] = ps[j][b]
+					ps[j][b] = 1
+					// Takes least significant bit of signs and uses it for the sign.
+					// Note 1 ^ (1 | (Q-1)) = Q-1.
+					ps[j][b] ^= uint32((-(signs[j] & 1)) & (1 | (common.Q - 1)))
+					signs[j] >>= 1
+
+					idx[j]++
+					if idx[j] == common.N {
+						continue PolyLoop
+					}
+				}
+			}
+
+			done = false
+		}
+
+		if done {
+			break
+		}
+
+		perm.Permute()
+		stateOffset = 0
+	}
+}
+
+// Samples p uniformly with τ non-zero coefficients in {q-1,1}.
+//
+// The polynomial p will be normalized.
+func PolyDeriveUniformBall(p *common.Poly, seed []byte) {
+	var buf [136]byte // SHAKE-256 rate is 136
+
+	h := sha3.NewShake256()
+	_, _ = h.Write(seed[:])
+	_, _ = h.Read(buf[:])
+
+	// Essentially we generate a sequence of τ ones or minus ones,
+	// prepend N-τ zeroes and shuffle the concatenation using the
+	// usual algorithm (Fisher--Yates.)
+	signs := binary.LittleEndian.Uint64(buf[:])
+	bufOff := 8 // offset into buf
+
+	*p = common.Poly{} // zero p
+	for i := uint16(common.N - Tau); i < common.N; i++ {
+		var b uint16
+
+		// Find location of where to move the new coefficient to using
+		// rejection sampling.
+		for {
+			if bufOff >= 136 {
+				_, _ = h.Read(buf[:])
+				bufOff = 0
+			}
+
+			b = uint16(buf[bufOff])
+			bufOff++
+
+			if b <= i {
+				break
+			}
+		}
+
+		p[i] = p[b]
+		p[b] = 1
+		// Takes least significant bit of signs and uses it for the sign.
+		// Note 1 ^ (1 | (Q-1)) = Q-1.
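+		// -(signs & 1) is an all-ones mask exactly when the sign bit is set,
+		// so the xor below turns the freshly placed 1 into q-1; otherwise it
+		// leaves it untouched.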
+ p[b] ^= uint32((-(signs & 1)) & (1 | (common.Q - 1))) + signs >>= 1 + } +} diff --git a/vendor/github.com/cloudflare/circl/sign/mldsa/mldsa65/internal/vec.go b/vendor/github.com/cloudflare/circl/sign/mldsa/mldsa65/internal/vec.go new file mode 100644 index 0000000000..d07d3b2458 --- /dev/null +++ b/vendor/github.com/cloudflare/circl/sign/mldsa/mldsa65/internal/vec.go @@ -0,0 +1,281 @@ +// Code generated from mode3/internal/vec.go by gen.go + +package internal + +import ( + common "github.com/cloudflare/circl/sign/internal/dilithium" +) + +// A vector of L polynomials. +type VecL [L]common.Poly + +// A vector of K polynomials. +type VecK [K]common.Poly + +// Normalize the polynomials in this vector. +func (v *VecL) Normalize() { + for i := 0; i < L; i++ { + v[i].Normalize() + } +} + +// Normalize the polynomials in this vector assuming their coefficients +// are already bounded by 2q. +func (v *VecL) NormalizeAssumingLe2Q() { + for i := 0; i < L; i++ { + v[i].NormalizeAssumingLe2Q() + } +} + +// Sets v to w + u. Does not normalize. +func (v *VecL) Add(w, u *VecL) { + for i := 0; i < L; i++ { + v[i].Add(&w[i], &u[i]) + } +} + +// Applies NTT componentwise. See Poly.NTT() for details. +func (v *VecL) NTT() { + for i := 0; i < L; i++ { + v[i].NTT() + } +} + +// Checks whether any of the coefficients exceeds the given bound in supnorm +// +// Requires the vector to be normalized. +func (v *VecL) Exceeds(bound uint32) bool { + for i := 0; i < L; i++ { + if v[i].Exceeds(bound) { + return true + } + } + return false +} + +// Applies Poly.Power2Round componentwise. +// +// Requires the vector to be normalized. +func (v *VecL) Power2Round(v0PlusQ, v1 *VecL) { + for i := 0; i < L; i++ { + v[i].Power2Round(&v0PlusQ[i], &v1[i]) + } +} + +// Applies Poly.Decompose componentwise. +// +// Requires the vector to be normalized. +func (v *VecL) Decompose(v0PlusQ, v1 *VecL) { + for i := 0; i < L; i++ { + PolyDecompose(&v[i], &v0PlusQ[i], &v1[i]) + } +} + +// Sequentially packs each polynomial using Poly.PackLeqEta(). +func (v *VecL) PackLeqEta(buf []byte) { + offset := 0 + for i := 0; i < L; i++ { + PolyPackLeqEta(&v[i], buf[offset:]) + offset += PolyLeqEtaSize + } +} + +// Sets v to the polynomials packed in buf using VecL.PackLeqEta(). +func (v *VecL) UnpackLeqEta(buf []byte) { + offset := 0 + for i := 0; i < L; i++ { + PolyUnpackLeqEta(&v[i], buf[offset:]) + offset += PolyLeqEtaSize + } +} + +// Sequentially packs each polynomial using PolyPackLeGamma1(). +func (v *VecL) PackLeGamma1(buf []byte) { + offset := 0 + for i := 0; i < L; i++ { + PolyPackLeGamma1(&v[i], buf[offset:]) + offset += PolyLeGamma1Size + } +} + +// Sets v to the polynomials packed in buf using VecL.PackLeGamma1(). +func (v *VecL) UnpackLeGamma1(buf []byte) { + offset := 0 + for i := 0; i < L; i++ { + PolyUnpackLeGamma1(&v[i], buf[offset:]) + offset += PolyLeGamma1Size + } +} + +// Normalize the polynomials in this vector. +func (v *VecK) Normalize() { + for i := 0; i < K; i++ { + v[i].Normalize() + } +} + +// Normalize the polynomials in this vector assuming their coefficients +// are already bounded by 2q. +func (v *VecK) NormalizeAssumingLe2Q() { + for i := 0; i < K; i++ { + v[i].NormalizeAssumingLe2Q() + } +} + +// Sets v to w + u. Does not normalize. +func (v *VecK) Add(w, u *VecK) { + for i := 0; i < K; i++ { + v[i].Add(&w[i], &u[i]) + } +} + +// Checks whether any of the coefficients exceeds the given bound in supnorm +// +// Requires the vector to be normalized. 
+func (v *VecK) Exceeds(bound uint32) bool { + for i := 0; i < K; i++ { + if v[i].Exceeds(bound) { + return true + } + } + return false +} + +// Applies Poly.Power2Round componentwise. +// +// Requires the vector to be normalized. +func (v *VecK) Power2Round(v0PlusQ, v1 *VecK) { + for i := 0; i < K; i++ { + v[i].Power2Round(&v0PlusQ[i], &v1[i]) + } +} + +// Applies Poly.Decompose componentwise. +// +// Requires the vector to be normalized. +func (v *VecK) Decompose(v0PlusQ, v1 *VecK) { + for i := 0; i < K; i++ { + PolyDecompose(&v[i], &v0PlusQ[i], &v1[i]) + } +} + +// Sets v to the hint vector for v0 the modified low bits and v1 +// the unmodified high bits --- see makeHint(). +// +// Returns the number of ones in the hint vector. +func (v *VecK) MakeHint(v0, v1 *VecK) (pop uint32) { + for i := 0; i < K; i++ { + pop += PolyMakeHint(&v[i], &v0[i], &v1[i]) + } + return +} + +// Computes corrections to the high bits of the polynomials in the vector +// w using the hints in h and sets v to the corrected high bits. Returns v. +// See useHint(). +func (v *VecK) UseHint(q, hint *VecK) *VecK { + for i := 0; i < K; i++ { + PolyUseHint(&v[i], &q[i], &hint[i]) + } + return v +} + +// Sequentially packs each polynomial using Poly.PackT1(). +func (v *VecK) PackT1(buf []byte) { + offset := 0 + for i := 0; i < K; i++ { + v[i].PackT1(buf[offset:]) + offset += common.PolyT1Size + } +} + +// Sets v to the vector packed into buf by PackT1(). +func (v *VecK) UnpackT1(buf []byte) { + offset := 0 + for i := 0; i < K; i++ { + v[i].UnpackT1(buf[offset:]) + offset += common.PolyT1Size + } +} + +// Sequentially packs each polynomial using Poly.PackT0(). +func (v *VecK) PackT0(buf []byte) { + offset := 0 + for i := 0; i < K; i++ { + v[i].PackT0(buf[offset:]) + offset += common.PolyT0Size + } +} + +// Sets v to the vector packed into buf by PackT0(). +func (v *VecK) UnpackT0(buf []byte) { + offset := 0 + for i := 0; i < K; i++ { + v[i].UnpackT0(buf[offset:]) + offset += common.PolyT0Size + } +} + +// Sequentially packs each polynomial using Poly.PackLeqEta(). +func (v *VecK) PackLeqEta(buf []byte) { + offset := 0 + for i := 0; i < K; i++ { + PolyPackLeqEta(&v[i], buf[offset:]) + offset += PolyLeqEtaSize + } +} + +// Sets v to the polynomials packed in buf using VecK.PackLeqEta(). +func (v *VecK) UnpackLeqEta(buf []byte) { + offset := 0 + for i := 0; i < K; i++ { + PolyUnpackLeqEta(&v[i], buf[offset:]) + offset += PolyLeqEtaSize + } +} + +// Applies NTT componentwise. See Poly.NTT() for details. +func (v *VecK) NTT() { + for i := 0; i < K; i++ { + v[i].NTT() + } +} + +// Sequentially packs each polynomial using PolyPackW1(). +func (v *VecK) PackW1(buf []byte) { + offset := 0 + for i := 0; i < K; i++ { + PolyPackW1(&v[i], buf[offset:]) + offset += PolyW1Size + } +} + +// Sets v to a - b. +// +// Warning: assumes coefficients of the polynomials of b are less than 2q. +func (v *VecK) Sub(a, b *VecK) { + for i := 0; i < K; i++ { + v[i].Sub(&a[i], &b[i]) + } +} + +// Sets v to 2ᵈ w without reducing. +func (v *VecK) MulBy2toD(w *VecK) { + for i := 0; i < K; i++ { + v[i].MulBy2toD(&w[i]) + } +} + +// Applies InvNTT componentwise. See Poly.InvNTT() for details. +func (v *VecK) InvNTT() { + for i := 0; i < K; i++ { + v[i].InvNTT() + } +} + +// Applies Poly.ReduceLe2Q() componentwise. 
+func (v *VecK) ReduceLe2Q() {
+	for i := 0; i < K; i++ {
+		v[i].ReduceLe2Q()
+	}
+}
diff --git a/vendor/github.com/cloudflare/circl/sign/sign.go b/vendor/github.com/cloudflare/circl/sign/sign.go
index 13b20fa4b0..557d6f0960 100644
--- a/vendor/github.com/cloudflare/circl/sign/sign.go
+++ b/vendor/github.com/cloudflare/circl/sign/sign.go
@@ -107,4 +107,7 @@ var (
 	// ErrContextNotSupported is the error used if a context is not
 	// supported.
 	ErrContextNotSupported = errors.New("context not supported")
+
+	// ErrContextTooLong is the error used if the context string is too long.
+	ErrContextTooLong = errors.New("context string too long")
 )
diff --git a/vendor/github.com/cloudflare/circl/simd/keccakf1600/f1600x.go b/vendor/github.com/cloudflare/circl/simd/keccakf1600/f1600x.go
new file mode 100644
index 0000000000..20ac96f006
--- /dev/null
+++ b/vendor/github.com/cloudflare/circl/simd/keccakf1600/f1600x.go
@@ -0,0 +1,163 @@
+// Package keccakf1600 provides two- and four-way Keccak-f[1600] permutations in parallel.
+//
+// Keccak-f[1600] is the permutation underlying several algorithms such as
+// Keccak, SHA3 and SHAKE. Running two or four permutations in parallel is
+// useful in some scenarios like in hash-based signatures.
+//
+// # Limitations
+//
+// Note that not all the architectures support SIMD instructions. This package
+// uses AVX2 instructions that are available in some AMD64 architectures
+// and NEON instructions that are available in some ARM64 architectures.
+//
+// For those systems not supporting these, the package still provides the
+// expected functionality by means of a generic and slow implementation.
+// The recommendation is to verify IsEnabledX4() and IsEnabledX2() beforehand
+// to determine if the current system supports the SIMD implementation.
package keccakf1600
+
+import (
+	"runtime"
+	"unsafe"
+
+	"github.com/cloudflare/circl/internal/sha3"
+	"golang.org/x/sys/cpu"
+)
+
+// StateX4 contains state for the four-way permutation including the four
+// interleaved [25]uint64 buffers. Call Initialize() before use to initialize
+// and get a pointer to the interleaved buffer.
+type StateX4 struct {
+	// Go guarantees a to be aligned on 8 bytes, whereas we need it to be
+	// aligned on 32 bytes for best performance. Thus we leave some headroom
+	// to be able to move the start of the state.
+
+	// 4 x 25 uint64s for the interleaved states and three uint64s headroom
+	// to fix alignment.
+	a [103]uint64
+
+	// Offset into a that is 32 byte aligned.
+	offset int
+
+	// If true, permute will use 12-round Keccak instead of 24-round Keccak.
+	turbo bool
+}
+
+// StateX2 contains state for the two-way permutation including the two
+// interleaved [25]uint64 buffers. Call Initialize() before use to initialize
+// and get a pointer to the interleaved buffer.
+type StateX2 struct {
+	// Go guarantees a to be aligned on 8 bytes, whereas we need it to be
+	// aligned on 32 bytes for best performance. Thus we leave some headroom
+	// to be able to move the start of the state.
+
+	// 2 x 25 uint64s for the interleaved states and three uint64s headroom
+	// to fix alignment.
+	a [53]uint64
+
+	// Offset into a that is 32 byte aligned.
+	offset int
+
+	// If true, permute will use 12-round Keccak instead of 24-round Keccak.
+	turbo bool
+}
+
+// IsEnabledX4 returns true if the architecture supports a four-way SIMD
+// implementation provided in this package.
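+//
+// In practice this is an AVX2 check; when it reports false, Permute falls
+// back to the scalar implementation below.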
+func IsEnabledX4() bool { return cpu.X86.HasAVX2 }
+
+// IsEnabledX2 returns true if the architecture supports a two-way SIMD
+// implementation provided in this package.
+func IsEnabledX2() bool { return enabledX2 }
+
+// Initializes the state and returns the buffer on which the four permutations
+// will act: a uint64 slice of length 100. The first permutation will act
+// on {a[0], a[4], ..., a[96]}, the second on {a[1], a[5], ..., a[97]}, etc.
+// If turbo is true, applies 12-round variant instead of the usual 24.
+func (s *StateX4) Initialize(turbo bool) []uint64 {
+	s.turbo = turbo
+	rp := unsafe.Pointer(&s.a[0])
+
+	// uint64s are always aligned by a multiple of 8. Compute the remainder
+	// of the address modulo 32 divided by 8.
+	rem := (int(uintptr(rp)&31) >> 3)
+
+	if rem != 0 {
+		s.offset = 4 - rem
+	}
+
+	// The slice we return will be aligned on 32 byte boundary.
+	return s.a[s.offset : s.offset+100]
+}
+
+// Initializes the state and returns the buffer on which the two permutations
+// will act: a uint64 slice of length 50. The first permutation will act
+// on {a[0], a[2], ..., a[48]} and the second on {a[1], a[3], ..., a[49]}.
+// If turbo is true, applies 12-round variant instead of the usual 24.
+func (s *StateX2) Initialize(turbo bool) []uint64 {
+	s.turbo = turbo
+	rp := unsafe.Pointer(&s.a[0])
+
+	// uint64s are always aligned by a multiple of 8. Compute the remainder
+	// of the address modulo 32 divided by 8.
+	rem := (int(uintptr(rp)&31) >> 3)
+
+	if rem != 0 {
+		s.offset = 4 - rem
+	}
+
+	// The slice we return will be aligned on 32 byte boundary.
+	return s.a[s.offset : s.offset+50]
+}
+
+// Permute performs the four parallel Keccak-f[1600]s interleaved on the slice
+// returned from Initialize().
+func (s *StateX4) Permute() {
+	if IsEnabledX4() {
+		permuteSIMDx4(s.a[s.offset:], s.turbo)
+	} else {
+		permuteScalarX4(s.a[s.offset:], s.turbo) // A slower generic implementation.
+	}
+}
+
+// Permute performs the two parallel Keccak-f[1600]s interleaved on the slice
+// returned from Initialize().
+func (s *StateX2) Permute() {
+	if IsEnabledX2() {
+		permuteSIMDx2(s.a[s.offset:], s.turbo)
+	} else {
+		permuteScalarX2(s.a[s.offset:], s.turbo) // A slower generic implementation.
+	}
+}
+
+func permuteScalarX4(a []uint64, turbo bool) {
+	var buf [25]uint64
+	for i := 0; i < 4; i++ {
+		for j := 0; j < 25; j++ {
+			buf[j] = a[4*j+i]
+		}
+		sha3.KeccakF1600(&buf, turbo)
+		for j := 0; j < 25; j++ {
+			a[4*j+i] = buf[j]
+		}
+	}
+}
+
+func permuteScalarX2(a []uint64, turbo bool) {
+	var buf [25]uint64
+	for i := 0; i < 2; i++ {
+		for j := 0; j < 25; j++ {
+			buf[j] = a[2*j+i]
+		}
+		sha3.KeccakF1600(&buf, turbo)
+		for j := 0; j < 25; j++ {
+			a[2*j+i] = buf[j]
+		}
+	}
+}
+
+var enabledX2 bool
+
+func init() {
+	enabledX2 = runtime.GOARCH == "arm64" && runtime.GOOS == "darwin"
+}
diff --git a/vendor/github.com/cloudflare/circl/simd/keccakf1600/f1600x2_arm64.go b/vendor/github.com/cloudflare/circl/simd/keccakf1600/f1600x2_arm64.go
new file mode 100644
index 0000000000..0cb9692c32
--- /dev/null
+++ b/vendor/github.com/cloudflare/circl/simd/keccakf1600/f1600x2_arm64.go
@@ -0,0 +1,13 @@
+//go:build arm64 && go1.16 && !purego
+// +build arm64,go1.16,!purego
+
+package keccakf1600
+
+import "github.com/cloudflare/circl/internal/sha3"
+
+func permuteSIMDx2(state []uint64, turbo bool) { f1600x2ARM(&state[0], &sha3.RC, turbo) }
+
+func permuteSIMDx4(state []uint64, turbo bool) { permuteScalarX4(state, turbo) }
+
+//go:noescape
+func f1600x2ARM(state *uint64, rc *[24]uint64, turbo bool)
diff --git a/vendor/github.com/cloudflare/circl/simd/keccakf1600/f1600x2_arm64.s b/vendor/github.com/cloudflare/circl/simd/keccakf1600/f1600x2_arm64.s
new file mode 100644
index 0000000000..998aeca5b4
--- /dev/null
+++ b/vendor/github.com/cloudflare/circl/simd/keccakf1600/f1600x2_arm64.s
@@ -0,0 +1,136 @@
+// +build arm64,go1.16,!purego
+
+// Taken from https://github.com/bwesterb/armed-keccak
+
+#include "textflag.h"
+
+// func f1600x2ARM(state *uint64, rc *[24]uint64, turbo bool)
+TEXT ·f1600x2ARM(SB), NOSPLIT, $0-17
+	MOVD state+0(FP), R0
+	MOVD rc+8(FP), R1
+	MOVD R0, R2
+	MOVD $24, R3
+
+	VLD1.P 64(R0), [ V0.B16,  V1.B16,  V2.B16,  V3.B16]
+	VLD1.P 64(R0), [ V4.B16,  V5.B16,  V6.B16,  V7.B16]
+	VLD1.P 64(R0), [ V8.B16,  V9.B16, V10.B16, V11.B16]
+	VLD1.P 64(R0), [V12.B16, V13.B16, V14.B16, V15.B16]
+	VLD1.P 64(R0), [V16.B16, V17.B16, V18.B16, V19.B16]
+	VLD1.P 64(R0), [V20.B16, V21.B16, V22.B16, V23.B16]
+	VLD1.P (R0), [V24.B16]
+
+	MOVBU turbo+16(FP), R4
+	CBZ R4, loop
+
+	SUB $12, R3, R3
+	ADD $96, R1, R1
+
+loop:
+	// Execute theta but without xorring into the state yet.
+	VEOR3 V10.B16, V5.B16, V0.B16, V25.B16
+	VEOR3 V11.B16, V6.B16, V1.B16, V26.B16
+	VEOR3 V12.B16, V7.B16, V2.B16, V27.B16
+	VEOR3 V13.B16, V8.B16, V3.B16, V28.B16
+	VEOR3 V14.B16, V9.B16, V4.B16, V29.B16
+
+	VEOR3 V20.B16, V15.B16, V25.B16, V25.B16
+	VEOR3 V21.B16, V16.B16, V26.B16, V26.B16
+	VEOR3 V22.B16, V17.B16, V27.B16, V27.B16
+	VEOR3 V23.B16, V18.B16, V28.B16, V28.B16
+	VEOR3 V24.B16, V19.B16, V29.B16, V29.B16
+
+	// Xor parities from step theta into the state at the same time as
+	// executing rho and pi.
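+	// (RAX1 computes x ⊕ (y <<< 1), yielding theta's D-values; XAR rotates
+	// the xor of its two operands, folding theta's xor into rho's rotation.)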
+ VRAX1 V26.D2, V29.D2, V30.D2 + VRAX1 V29.D2, V27.D2, V29.D2 + VRAX1 V27.D2, V25.D2, V27.D2 + VRAX1 V25.D2, V28.D2, V25.D2 + VRAX1 V28.D2, V26.D2, V28.D2 + + VEOR V30.B16, V0.B16, V0.B16 + VMOV V1.B16, V31.B16 + + VXAR $20, V27.D2, V6.D2, V1.D2 + VXAR $44, V25.D2, V9.D2, V6.D2 + VXAR $3 , V28.D2, V22.D2, V9.D2 + VXAR $25, V25.D2, V14.D2, V22.D2 + VXAR $46, V30.D2, V20.D2, V14.D2 + VXAR $2 , V28.D2, V2.D2, V20.D2 + VXAR $21, V28.D2, V12.D2, V2.D2 + VXAR $39, V29.D2, V13.D2, V12.D2 + VXAR $56, V25.D2, V19.D2, V13.D2 + VXAR $8 , V29.D2, V23.D2, V19.D2 + VXAR $23, V30.D2, V15.D2, V23.D2 + VXAR $37, V25.D2, V4.D2, V15.D2 + VXAR $50, V25.D2, V24.D2, V4.D2 + VXAR $62, V27.D2, V21.D2, V24.D2 + VXAR $9 , V29.D2, V8.D2, V21.D2 + VXAR $19, V27.D2, V16.D2, V8.D2 + VXAR $28, V30.D2, V5.D2, V16.D2 + VXAR $36, V29.D2, V3.D2, V5.D2 + VXAR $43, V29.D2, V18.D2, V3.D2 + VXAR $49, V28.D2, V17.D2, V18.D2 + VXAR $54, V27.D2, V11.D2, V17.D2 + VXAR $58, V28.D2, V7.D2, V11.D2 + VXAR $61, V30.D2, V10.D2, V7.D2 + VXAR $63, V27.D2, V31.D2, V10.D2 + + // Chi + VBCAX V1.B16, V2.B16, V0.B16, V25.B16 + VBCAX V2.B16, V3.B16, V1.B16, V26.B16 + VBCAX V3.B16, V4.B16, V2.B16, V2.B16 + VBCAX V4.B16, V0.B16, V3.B16, V3.B16 + VBCAX V0.B16, V1.B16, V4.B16, V4.B16 + VMOV V25.B16, V0.B16 + VMOV V26.B16, V1.B16 + + VBCAX V6.B16, V7.B16, V5.B16, V25.B16 + VBCAX V7.B16, V8.B16, V6.B16, V26.B16 + VBCAX V8.B16, V9.B16, V7.B16, V7.B16 + VBCAX V9.B16, V5.B16, V8.B16, V8.B16 + VBCAX V5.B16, V6.B16, V9.B16, V9.B16 + VMOV V25.B16, V5.B16 + VMOV V26.B16, V6.B16 + + VBCAX V11.B16, V12.B16, V10.B16, V25.B16 + VBCAX V12.B16, V13.B16, V11.B16, V26.B16 + VBCAX V13.B16, V14.B16, V12.B16, V12.B16 + VBCAX V14.B16, V10.B16, V13.B16, V13.B16 + VBCAX V10.B16, V11.B16, V14.B16, V14.B16 + VMOV V25.B16, V10.B16 + VMOV V26.B16, V11.B16 + + VBCAX V16.B16, V17.B16, V15.B16, V25.B16 + VBCAX V17.B16, V18.B16, V16.B16, V26.B16 + VBCAX V18.B16, V19.B16, V17.B16, V17.B16 + VBCAX V19.B16, V15.B16, V18.B16, V18.B16 + VBCAX V15.B16, V16.B16, V19.B16, V19.B16 + VMOV V25.B16, V15.B16 + VMOV V26.B16, V16.B16 + + VBCAX V21.B16, V22.B16, V20.B16, V25.B16 + VBCAX V22.B16, V23.B16, V21.B16, V26.B16 + VBCAX V23.B16, V24.B16, V22.B16, V22.B16 + VBCAX V24.B16, V20.B16, V23.B16, V23.B16 + VBCAX V20.B16, V21.B16, V24.B16, V24.B16 + VMOV V25.B16, V20.B16 + VMOV V26.B16, V21.B16 + + // Iota + VLD1R.P 8(R1), [V25.D2] + VEOR V25.B16, V0.B16, V0.B16 + + SUBS $1, R3, R3 + CBNZ R3, loop + + MOVD R2, R0 + + VST1.P [ V0.B16, V1.B16, V2.B16, V3.B16], 64(R0) + VST1.P [ V4.B16, V5.B16, V6.B16, V7.B16], 64(R0) + VST1.P [ V8.B16, V9.B16, V10.B16, V11.B16], 64(R0) + VST1.P [V12.B16, V13.B16, V14.B16, V15.B16], 64(R0) + VST1.P [V16.B16, V17.B16, V18.B16, V19.B16], 64(R0) + VST1.P [V20.B16, V21.B16, V22.B16, V23.B16], 64(R0) + VST1.P [V24.B16], (R0) + + RET diff --git a/vendor/github.com/cloudflare/circl/simd/keccakf1600/f1600x4_amd64.go b/vendor/github.com/cloudflare/circl/simd/keccakf1600/f1600x4_amd64.go new file mode 100644 index 0000000000..bf5b865d0b --- /dev/null +++ b/vendor/github.com/cloudflare/circl/simd/keccakf1600/f1600x4_amd64.go @@ -0,0 +1,10 @@ +//go:build amd64 && !purego +// +build amd64,!purego + +package keccakf1600 + +import "github.com/cloudflare/circl/internal/sha3" + +func permuteSIMDx4(state []uint64, turbo bool) { f1600x4AVX2(&state[0], &sha3.RC, turbo) } + +func permuteSIMDx2(state []uint64, turbo bool) { permuteScalarX2(state, turbo) } diff --git a/vendor/github.com/cloudflare/circl/simd/keccakf1600/f1600x4_amd64.s 
b/vendor/github.com/cloudflare/circl/simd/keccakf1600/f1600x4_amd64.s new file mode 100644 index 0000000000..67b64550c2 --- /dev/null +++ b/vendor/github.com/cloudflare/circl/simd/keccakf1600/f1600x4_amd64.s @@ -0,0 +1,899 @@ +// Code generated by command: go run src.go -out ../../f1600x4_amd64.s -stubs ../../f1600x4stubs_amd64.go -pkg keccakf1600. DO NOT EDIT. + +//go:build amd64 && !purego + +#include "textflag.h" + +// func f1600x4AVX2(state *uint64, rc *[24]uint64, turbo bool) +// Requires: AVX, AVX2 +TEXT ·f1600x4AVX2(SB), NOSPLIT, $0-17 + MOVQ state+0(FP), AX + MOVQ rc+8(FP), CX + MOVQ $0x0000000000000006, DX + MOVBQZX turbo+16(FP), BX + TESTQ BX, BX + JZ loop + MOVQ $0x0000000000000003, DX + ADDQ $0x60, CX + +loop: + VMOVDQA (AX), Y0 + VMOVDQA 32(AX), Y1 + VMOVDQA 64(AX), Y2 + VMOVDQA 96(AX), Y3 + VMOVDQA 128(AX), Y4 + VPXOR 160(AX), Y0, Y0 + VPXOR 192(AX), Y1, Y1 + VPXOR 224(AX), Y2, Y2 + VPXOR 256(AX), Y3, Y3 + VPXOR 288(AX), Y4, Y4 + VPXOR 320(AX), Y0, Y0 + VPXOR 352(AX), Y1, Y1 + VPXOR 384(AX), Y2, Y2 + VPXOR 416(AX), Y3, Y3 + VPXOR 448(AX), Y4, Y4 + VPXOR 480(AX), Y0, Y0 + VPXOR 512(AX), Y1, Y1 + VPXOR 544(AX), Y2, Y2 + VPXOR 576(AX), Y3, Y3 + VPXOR 608(AX), Y4, Y4 + VPXOR 640(AX), Y0, Y0 + VPXOR 672(AX), Y1, Y1 + VPXOR 704(AX), Y2, Y2 + VPXOR 736(AX), Y3, Y3 + VPXOR 768(AX), Y4, Y4 + VPSLLQ $0x01, Y1, Y5 + VPSLLQ $0x01, Y2, Y6 + VPSLLQ $0x01, Y3, Y7 + VPSLLQ $0x01, Y4, Y8 + VPSLLQ $0x01, Y0, Y9 + VPSRLQ $0x3f, Y1, Y10 + VPSRLQ $0x3f, Y2, Y11 + VPSRLQ $0x3f, Y3, Y12 + VPSRLQ $0x3f, Y4, Y13 + VPSRLQ $0x3f, Y0, Y14 + VPOR Y5, Y10, Y10 + VPOR Y6, Y11, Y11 + VPOR Y7, Y12, Y12 + VPOR Y8, Y13, Y13 + VPOR Y9, Y14, Y14 + VPXOR Y10, Y4, Y10 + VPXOR Y11, Y0, Y11 + VPXOR Y12, Y1, Y12 + VPXOR Y13, Y2, Y13 + VPXOR Y14, Y3, Y14 + VPXOR (AX), Y10, Y0 + VPXOR 192(AX), Y11, Y1 + VPXOR 384(AX), Y12, Y2 + VPXOR 576(AX), Y13, Y3 + VPXOR 768(AX), Y14, Y4 + VPSLLQ $0x2c, Y1, Y6 + VPSLLQ $0x2b, Y2, Y7 + VPSLLQ $0x15, Y3, Y8 + VPSLLQ $0x0e, Y4, Y9 + VPSRLQ $0x14, Y1, Y1 + VPSRLQ $0x15, Y2, Y2 + VPSRLQ $0x2b, Y3, Y3 + VPSRLQ $0x32, Y4, Y4 + VPOR Y6, Y1, Y1 + VPOR Y7, Y2, Y2 + VPOR Y8, Y3, Y3 + VPOR Y9, Y4, Y4 + VPANDN Y2, Y1, Y5 + VPANDN Y3, Y2, Y6 + VPANDN Y4, Y3, Y7 + VPANDN Y0, Y4, Y8 + VPANDN Y1, Y0, Y9 + VPXOR Y0, Y5, Y5 + VPXOR Y1, Y6, Y6 + VPXOR Y2, Y7, Y7 + VPXOR Y3, Y8, Y8 + VPXOR Y4, Y9, Y9 + VPBROADCASTQ (CX), Y0 + VPXOR Y0, Y5, Y5 + VMOVDQA Y5, (AX) + VMOVDQA Y6, 192(AX) + VMOVDQA Y7, 384(AX) + VMOVDQA Y8, 576(AX) + VMOVDQA Y9, 768(AX) + VPXOR 96(AX), Y13, Y0 + VPXOR 288(AX), Y14, Y1 + VPXOR 320(AX), Y10, Y2 + VPXOR 512(AX), Y11, Y3 + VPXOR 704(AX), Y12, Y4 + VPSLLQ $0x1c, Y0, Y5 + VPSLLQ $0x14, Y1, Y6 + VPSLLQ $0x03, Y2, Y7 + VPSLLQ $0x2d, Y3, Y8 + VPSLLQ $0x3d, Y4, Y9 + VPSRLQ $0x24, Y0, Y0 + VPSRLQ $0x2c, Y1, Y1 + VPSRLQ $0x3d, Y2, Y2 + VPSRLQ $0x13, Y3, Y3 + VPSRLQ $0x03, Y4, Y4 + VPOR Y5, Y0, Y0 + VPOR Y6, Y1, Y1 + VPOR Y7, Y2, Y2 + VPOR Y8, Y3, Y3 + VPOR Y9, Y4, Y4 + VPANDN Y2, Y1, Y5 + VPANDN Y3, Y2, Y6 + VPANDN Y4, Y3, Y7 + VPANDN Y0, Y4, Y8 + VPANDN Y1, Y0, Y9 + VPXOR Y0, Y5, Y5 + VPXOR Y1, Y6, Y6 + VPXOR Y2, Y7, Y7 + VPXOR Y3, Y8, Y8 + VPXOR Y4, Y9, Y9 + VMOVDQA Y5, 320(AX) + VMOVDQA Y6, 512(AX) + VMOVDQA Y7, 704(AX) + VMOVDQA Y8, 96(AX) + VMOVDQA Y9, 288(AX) + VPXOR 32(AX), Y11, Y0 + VPXOR 224(AX), Y12, Y1 + VPXOR 416(AX), Y13, Y2 + VPXOR 608(AX), Y14, Y3 + VPXOR 640(AX), Y10, Y4 + VPSLLQ $0x01, Y0, Y5 + VPSLLQ $0x06, Y1, Y6 + VPSLLQ $0x19, Y2, Y7 + VPSLLQ $0x08, Y3, Y8 + VPSLLQ $0x12, Y4, Y9 + VPSRLQ $0x3f, Y0, Y0 + VPSRLQ $0x3a, Y1, Y1 + VPSRLQ $0x27, Y2, Y2 + VPSRLQ $0x38, Y3, 
Y3 + VPSRLQ $0x2e, Y4, Y4 + VPOR Y5, Y0, Y0 + VPOR Y6, Y1, Y1 + VPOR Y7, Y2, Y2 + VPOR Y8, Y3, Y3 + VPOR Y9, Y4, Y4 + VPANDN Y2, Y1, Y5 + VPANDN Y3, Y2, Y6 + VPANDN Y4, Y3, Y7 + VPANDN Y0, Y4, Y8 + VPANDN Y1, Y0, Y9 + VPXOR Y0, Y5, Y5 + VPXOR Y1, Y6, Y6 + VPXOR Y2, Y7, Y7 + VPXOR Y3, Y8, Y8 + VPXOR Y4, Y9, Y9 + VMOVDQA Y5, 640(AX) + VMOVDQA Y6, 32(AX) + VMOVDQA Y7, 224(AX) + VMOVDQA Y8, 416(AX) + VMOVDQA Y9, 608(AX) + VPXOR 128(AX), Y14, Y0 + VPXOR 160(AX), Y10, Y1 + VPXOR 352(AX), Y11, Y2 + VPXOR 544(AX), Y12, Y3 + VPXOR 736(AX), Y13, Y4 + VPSLLQ $0x1b, Y0, Y5 + VPSLLQ $0x24, Y1, Y6 + VPSLLQ $0x0a, Y2, Y7 + VPSLLQ $0x0f, Y3, Y8 + VPSLLQ $0x38, Y4, Y9 + VPSRLQ $0x25, Y0, Y0 + VPSRLQ $0x1c, Y1, Y1 + VPSRLQ $0x36, Y2, Y2 + VPSRLQ $0x31, Y3, Y3 + VPSRLQ $0x08, Y4, Y4 + VPOR Y5, Y0, Y0 + VPOR Y6, Y1, Y1 + VPOR Y7, Y2, Y2 + VPOR Y8, Y3, Y3 + VPOR Y9, Y4, Y4 + VPANDN Y2, Y1, Y5 + VPANDN Y3, Y2, Y6 + VPANDN Y4, Y3, Y7 + VPANDN Y0, Y4, Y8 + VPANDN Y1, Y0, Y9 + VPXOR Y0, Y5, Y5 + VPXOR Y1, Y6, Y6 + VPXOR Y2, Y7, Y7 + VPXOR Y3, Y8, Y8 + VPXOR Y4, Y9, Y9 + VMOVDQA Y5, 160(AX) + VMOVDQA Y6, 352(AX) + VMOVDQA Y7, 544(AX) + VMOVDQA Y8, 736(AX) + VMOVDQA Y9, 128(AX) + VPXOR 64(AX), Y12, Y0 + VPXOR 256(AX), Y13, Y1 + VPXOR 448(AX), Y14, Y2 + VPXOR 480(AX), Y10, Y3 + VPXOR 672(AX), Y11, Y4 + VPSLLQ $0x3e, Y0, Y5 + VPSLLQ $0x37, Y1, Y6 + VPSLLQ $0x27, Y2, Y7 + VPSLLQ $0x29, Y3, Y8 + VPSLLQ $0x02, Y4, Y9 + VPSRLQ $0x02, Y0, Y0 + VPSRLQ $0x09, Y1, Y1 + VPSRLQ $0x19, Y2, Y2 + VPSRLQ $0x17, Y3, Y3 + VPSRLQ $0x3e, Y4, Y4 + VPOR Y5, Y0, Y0 + VPOR Y6, Y1, Y1 + VPOR Y7, Y2, Y2 + VPOR Y8, Y3, Y3 + VPOR Y9, Y4, Y4 + VPANDN Y2, Y1, Y5 + VPANDN Y3, Y2, Y6 + VPANDN Y4, Y3, Y7 + VPANDN Y0, Y4, Y8 + VPANDN Y1, Y0, Y9 + VPXOR Y0, Y5, Y5 + VPXOR Y1, Y6, Y6 + VPXOR Y2, Y7, Y7 + VPXOR Y3, Y8, Y8 + VPXOR Y4, Y9, Y9 + VMOVDQA Y5, 480(AX) + VMOVDQA Y6, 672(AX) + VMOVDQA Y7, 64(AX) + VMOVDQA Y8, 256(AX) + VMOVDQA Y9, 448(AX) + VMOVDQA (AX), Y0 + VMOVDQA 32(AX), Y1 + VMOVDQA 64(AX), Y2 + VMOVDQA 96(AX), Y3 + VMOVDQA 128(AX), Y4 + VPXOR 160(AX), Y0, Y0 + VPXOR 192(AX), Y1, Y1 + VPXOR 224(AX), Y2, Y2 + VPXOR 256(AX), Y3, Y3 + VPXOR 288(AX), Y4, Y4 + VPXOR 320(AX), Y0, Y0 + VPXOR 352(AX), Y1, Y1 + VPXOR 384(AX), Y2, Y2 + VPXOR 416(AX), Y3, Y3 + VPXOR 448(AX), Y4, Y4 + VPXOR 480(AX), Y0, Y0 + VPXOR 512(AX), Y1, Y1 + VPXOR 544(AX), Y2, Y2 + VPXOR 576(AX), Y3, Y3 + VPXOR 608(AX), Y4, Y4 + VPXOR 640(AX), Y0, Y0 + VPXOR 672(AX), Y1, Y1 + VPXOR 704(AX), Y2, Y2 + VPXOR 736(AX), Y3, Y3 + VPXOR 768(AX), Y4, Y4 + VPSLLQ $0x01, Y1, Y5 + VPSLLQ $0x01, Y2, Y6 + VPSLLQ $0x01, Y3, Y7 + VPSLLQ $0x01, Y4, Y8 + VPSLLQ $0x01, Y0, Y9 + VPSRLQ $0x3f, Y1, Y10 + VPSRLQ $0x3f, Y2, Y11 + VPSRLQ $0x3f, Y3, Y12 + VPSRLQ $0x3f, Y4, Y13 + VPSRLQ $0x3f, Y0, Y14 + VPOR Y5, Y10, Y10 + VPOR Y6, Y11, Y11 + VPOR Y7, Y12, Y12 + VPOR Y8, Y13, Y13 + VPOR Y9, Y14, Y14 + VPXOR Y10, Y4, Y10 + VPXOR Y11, Y0, Y11 + VPXOR Y12, Y1, Y12 + VPXOR Y13, Y2, Y13 + VPXOR Y14, Y3, Y14 + VPXOR (AX), Y10, Y0 + VPXOR 512(AX), Y11, Y1 + VPXOR 224(AX), Y12, Y2 + VPXOR 736(AX), Y13, Y3 + VPXOR 448(AX), Y14, Y4 + VPSLLQ $0x2c, Y1, Y6 + VPSLLQ $0x2b, Y2, Y7 + VPSLLQ $0x15, Y3, Y8 + VPSLLQ $0x0e, Y4, Y9 + VPSRLQ $0x14, Y1, Y1 + VPSRLQ $0x15, Y2, Y2 + VPSRLQ $0x2b, Y3, Y3 + VPSRLQ $0x32, Y4, Y4 + VPOR Y6, Y1, Y1 + VPOR Y7, Y2, Y2 + VPOR Y8, Y3, Y3 + VPOR Y9, Y4, Y4 + VPANDN Y2, Y1, Y5 + VPANDN Y3, Y2, Y6 + VPANDN Y4, Y3, Y7 + VPANDN Y0, Y4, Y8 + VPANDN Y1, Y0, Y9 + VPXOR Y0, Y5, Y5 + VPXOR Y1, Y6, Y6 + VPXOR Y2, Y7, Y7 + VPXOR Y3, Y8, Y8 + VPXOR Y4, Y9, Y9 + VPBROADCASTQ 8(CX), Y0 + VPXOR Y0, 
Y5, Y5 + VMOVDQA Y5, (AX) + VMOVDQA Y6, 512(AX) + VMOVDQA Y7, 224(AX) + VMOVDQA Y8, 736(AX) + VMOVDQA Y9, 448(AX) + VPXOR 576(AX), Y13, Y0 + VPXOR 288(AX), Y14, Y1 + VPXOR 640(AX), Y10, Y2 + VPXOR 352(AX), Y11, Y3 + VPXOR 64(AX), Y12, Y4 + VPSLLQ $0x1c, Y0, Y5 + VPSLLQ $0x14, Y1, Y6 + VPSLLQ $0x03, Y2, Y7 + VPSLLQ $0x2d, Y3, Y8 + VPSLLQ $0x3d, Y4, Y9 + VPSRLQ $0x24, Y0, Y0 + VPSRLQ $0x2c, Y1, Y1 + VPSRLQ $0x3d, Y2, Y2 + VPSRLQ $0x13, Y3, Y3 + VPSRLQ $0x03, Y4, Y4 + VPOR Y5, Y0, Y0 + VPOR Y6, Y1, Y1 + VPOR Y7, Y2, Y2 + VPOR Y8, Y3, Y3 + VPOR Y9, Y4, Y4 + VPANDN Y2, Y1, Y5 + VPANDN Y3, Y2, Y6 + VPANDN Y4, Y3, Y7 + VPANDN Y0, Y4, Y8 + VPANDN Y1, Y0, Y9 + VPXOR Y0, Y5, Y5 + VPXOR Y1, Y6, Y6 + VPXOR Y2, Y7, Y7 + VPXOR Y3, Y8, Y8 + VPXOR Y4, Y9, Y9 + VMOVDQA Y5, 640(AX) + VMOVDQA Y6, 352(AX) + VMOVDQA Y7, 64(AX) + VMOVDQA Y8, 576(AX) + VMOVDQA Y9, 288(AX) + VPXOR 192(AX), Y11, Y0 + VPXOR 704(AX), Y12, Y1 + VPXOR 416(AX), Y13, Y2 + VPXOR 128(AX), Y14, Y3 + VPXOR 480(AX), Y10, Y4 + VPSLLQ $0x01, Y0, Y5 + VPSLLQ $0x06, Y1, Y6 + VPSLLQ $0x19, Y2, Y7 + VPSLLQ $0x08, Y3, Y8 + VPSLLQ $0x12, Y4, Y9 + VPSRLQ $0x3f, Y0, Y0 + VPSRLQ $0x3a, Y1, Y1 + VPSRLQ $0x27, Y2, Y2 + VPSRLQ $0x38, Y3, Y3 + VPSRLQ $0x2e, Y4, Y4 + VPOR Y5, Y0, Y0 + VPOR Y6, Y1, Y1 + VPOR Y7, Y2, Y2 + VPOR Y8, Y3, Y3 + VPOR Y9, Y4, Y4 + VPANDN Y2, Y1, Y5 + VPANDN Y3, Y2, Y6 + VPANDN Y4, Y3, Y7 + VPANDN Y0, Y4, Y8 + VPANDN Y1, Y0, Y9 + VPXOR Y0, Y5, Y5 + VPXOR Y1, Y6, Y6 + VPXOR Y2, Y7, Y7 + VPXOR Y3, Y8, Y8 + VPXOR Y4, Y9, Y9 + VMOVDQA Y5, 480(AX) + VMOVDQA Y6, 192(AX) + VMOVDQA Y7, 704(AX) + VMOVDQA Y8, 416(AX) + VMOVDQA Y9, 128(AX) + VPXOR 768(AX), Y14, Y0 + VPXOR 320(AX), Y10, Y1 + VPXOR 32(AX), Y11, Y2 + VPXOR 544(AX), Y12, Y3 + VPXOR 256(AX), Y13, Y4 + VPSLLQ $0x1b, Y0, Y5 + VPSLLQ $0x24, Y1, Y6 + VPSLLQ $0x0a, Y2, Y7 + VPSLLQ $0x0f, Y3, Y8 + VPSLLQ $0x38, Y4, Y9 + VPSRLQ $0x25, Y0, Y0 + VPSRLQ $0x1c, Y1, Y1 + VPSRLQ $0x36, Y2, Y2 + VPSRLQ $0x31, Y3, Y3 + VPSRLQ $0x08, Y4, Y4 + VPOR Y5, Y0, Y0 + VPOR Y6, Y1, Y1 + VPOR Y7, Y2, Y2 + VPOR Y8, Y3, Y3 + VPOR Y9, Y4, Y4 + VPANDN Y2, Y1, Y5 + VPANDN Y3, Y2, Y6 + VPANDN Y4, Y3, Y7 + VPANDN Y0, Y4, Y8 + VPANDN Y1, Y0, Y9 + VPXOR Y0, Y5, Y5 + VPXOR Y1, Y6, Y6 + VPXOR Y2, Y7, Y7 + VPXOR Y3, Y8, Y8 + VPXOR Y4, Y9, Y9 + VMOVDQA Y5, 320(AX) + VMOVDQA Y6, 32(AX) + VMOVDQA Y7, 544(AX) + VMOVDQA Y8, 256(AX) + VMOVDQA Y9, 768(AX) + VPXOR 384(AX), Y12, Y0 + VPXOR 96(AX), Y13, Y1 + VPXOR 608(AX), Y14, Y2 + VPXOR 160(AX), Y10, Y3 + VPXOR 672(AX), Y11, Y4 + VPSLLQ $0x3e, Y0, Y5 + VPSLLQ $0x37, Y1, Y6 + VPSLLQ $0x27, Y2, Y7 + VPSLLQ $0x29, Y3, Y8 + VPSLLQ $0x02, Y4, Y9 + VPSRLQ $0x02, Y0, Y0 + VPSRLQ $0x09, Y1, Y1 + VPSRLQ $0x19, Y2, Y2 + VPSRLQ $0x17, Y3, Y3 + VPSRLQ $0x3e, Y4, Y4 + VPOR Y5, Y0, Y0 + VPOR Y6, Y1, Y1 + VPOR Y7, Y2, Y2 + VPOR Y8, Y3, Y3 + VPOR Y9, Y4, Y4 + VPANDN Y2, Y1, Y5 + VPANDN Y3, Y2, Y6 + VPANDN Y4, Y3, Y7 + VPANDN Y0, Y4, Y8 + VPANDN Y1, Y0, Y9 + VPXOR Y0, Y5, Y5 + VPXOR Y1, Y6, Y6 + VPXOR Y2, Y7, Y7 + VPXOR Y3, Y8, Y8 + VPXOR Y4, Y9, Y9 + VMOVDQA Y5, 160(AX) + VMOVDQA Y6, 672(AX) + VMOVDQA Y7, 384(AX) + VMOVDQA Y8, 96(AX) + VMOVDQA Y9, 608(AX) + VMOVDQA (AX), Y0 + VMOVDQA 32(AX), Y1 + VMOVDQA 64(AX), Y2 + VMOVDQA 96(AX), Y3 + VMOVDQA 128(AX), Y4 + VPXOR 160(AX), Y0, Y0 + VPXOR 192(AX), Y1, Y1 + VPXOR 224(AX), Y2, Y2 + VPXOR 256(AX), Y3, Y3 + VPXOR 288(AX), Y4, Y4 + VPXOR 320(AX), Y0, Y0 + VPXOR 352(AX), Y1, Y1 + VPXOR 384(AX), Y2, Y2 + VPXOR 416(AX), Y3, Y3 + VPXOR 448(AX), Y4, Y4 + VPXOR 480(AX), Y0, Y0 + VPXOR 512(AX), Y1, Y1 + VPXOR 544(AX), Y2, Y2 + VPXOR 576(AX), Y3, Y3 + 
+	VPXOR 608(AX), Y4, Y4
+	VPXOR 640(AX), Y0, Y0
+	VPXOR 672(AX), Y1, Y1
+	VPXOR 704(AX), Y2, Y2
+	VPXOR 736(AX), Y3, Y3
+	VPXOR 768(AX), Y4, Y4
+	VPSLLQ $0x01, Y1, Y5
+	VPSLLQ $0x01, Y2, Y6
+	VPSLLQ $0x01, Y3, Y7
+	VPSLLQ $0x01, Y4, Y8
+	VPSLLQ $0x01, Y0, Y9
+	VPSRLQ $0x3f, Y1, Y10
+	VPSRLQ $0x3f, Y2, Y11
+	VPSRLQ $0x3f, Y3, Y12
+	VPSRLQ $0x3f, Y4, Y13
+	VPSRLQ $0x3f, Y0, Y14
+	VPOR Y5, Y10, Y10
+	VPOR Y6, Y11, Y11
+	VPOR Y7, Y12, Y12
+	VPOR Y8, Y13, Y13
+	VPOR Y9, Y14, Y14
+	VPXOR Y10, Y4, Y10
+	VPXOR Y11, Y0, Y11
+	VPXOR Y12, Y1, Y12
+	VPXOR Y13, Y2, Y13
+	VPXOR Y14, Y3, Y14
+	VPXOR (AX), Y10, Y0
+	VPXOR 352(AX), Y11, Y1
+	VPXOR 704(AX), Y12, Y2
+	VPXOR 256(AX), Y13, Y3
+	VPXOR 608(AX), Y14, Y4
+	VPSLLQ $0x2c, Y1, Y6
+	VPSLLQ $0x2b, Y2, Y7
+	VPSLLQ $0x15, Y3, Y8
+	VPSLLQ $0x0e, Y4, Y9
+	VPSRLQ $0x14, Y1, Y1
+	VPSRLQ $0x15, Y2, Y2
+	VPSRLQ $0x2b, Y3, Y3
+	VPSRLQ $0x32, Y4, Y4
+	VPOR Y6, Y1, Y1
+	VPOR Y7, Y2, Y2
+	VPOR Y8, Y3, Y3
+	VPOR Y9, Y4, Y4
+	VPANDN Y2, Y1, Y5
+	VPANDN Y3, Y2, Y6
+	VPANDN Y4, Y3, Y7
+	VPANDN Y0, Y4, Y8
+	VPANDN Y1, Y0, Y9
+	VPXOR Y0, Y5, Y5
+	VPXOR Y1, Y6, Y6
+	VPXOR Y2, Y7, Y7
+	VPXOR Y3, Y8, Y8
+	VPXOR Y4, Y9, Y9
+	VPBROADCASTQ 16(CX), Y0
+	VPXOR Y0, Y5, Y5
+	VMOVDQA Y5, (AX)
+	VMOVDQA Y6, 352(AX)
+	VMOVDQA Y7, 704(AX)
+	VMOVDQA Y8, 256(AX)
+	VMOVDQA Y9, 608(AX)
+	VPXOR 736(AX), Y13, Y0
+	VPXOR 288(AX), Y14, Y1
+	VPXOR 480(AX), Y10, Y2
+	VPXOR 32(AX), Y11, Y3
+	VPXOR 384(AX), Y12, Y4
+	VPSLLQ $0x1c, Y0, Y5
+	VPSLLQ $0x14, Y1, Y6
+	VPSLLQ $0x03, Y2, Y7
+	VPSLLQ $0x2d, Y3, Y8
+	VPSLLQ $0x3d, Y4, Y9
+	VPSRLQ $0x24, Y0, Y0
+	VPSRLQ $0x2c, Y1, Y1
+	VPSRLQ $0x3d, Y2, Y2
+	VPSRLQ $0x13, Y3, Y3
+	VPSRLQ $0x03, Y4, Y4
+	VPOR Y5, Y0, Y0
+	VPOR Y6, Y1, Y1
+	VPOR Y7, Y2, Y2
+	VPOR Y8, Y3, Y3
+	VPOR Y9, Y4, Y4
+	VPANDN Y2, Y1, Y5
+	VPANDN Y3, Y2, Y6
+	VPANDN Y4, Y3, Y7
+	VPANDN Y0, Y4, Y8
+	VPANDN Y1, Y0, Y9
+	VPXOR Y0, Y5, Y5
+	VPXOR Y1, Y6, Y6
+	VPXOR Y2, Y7, Y7
+	VPXOR Y3, Y8, Y8
+	VPXOR Y4, Y9, Y9
+	VMOVDQA Y5, 480(AX)
+	VMOVDQA Y6, 32(AX)
+	VMOVDQA Y7, 384(AX)
+	VMOVDQA Y8, 736(AX)
+	VMOVDQA Y9, 288(AX)
+	VPXOR 512(AX), Y11, Y0
+	VPXOR 64(AX), Y12, Y1
+	VPXOR 416(AX), Y13, Y2
+	VPXOR 768(AX), Y14, Y3
+	VPXOR 160(AX), Y10, Y4
+	VPSLLQ $0x01, Y0, Y5
+	VPSLLQ $0x06, Y1, Y6
+	VPSLLQ $0x19, Y2, Y7
+	VPSLLQ $0x08, Y3, Y8
+	VPSLLQ $0x12, Y4, Y9
+	VPSRLQ $0x3f, Y0, Y0
+	VPSRLQ $0x3a, Y1, Y1
+	VPSRLQ $0x27, Y2, Y2
+	VPSRLQ $0x38, Y3, Y3
+	VPSRLQ $0x2e, Y4, Y4
+	VPOR Y5, Y0, Y0
+	VPOR Y6, Y1, Y1
+	VPOR Y7, Y2, Y2
+	VPOR Y8, Y3, Y3
+	VPOR Y9, Y4, Y4
+	VPANDN Y2, Y1, Y5
+	VPANDN Y3, Y2, Y6
+	VPANDN Y4, Y3, Y7
+	VPANDN Y0, Y4, Y8
+	VPANDN Y1, Y0, Y9
+	VPXOR Y0, Y5, Y5
+	VPXOR Y1, Y6, Y6
+	VPXOR Y2, Y7, Y7
+	VPXOR Y3, Y8, Y8
+	VPXOR Y4, Y9, Y9
+	VMOVDQA Y5, 160(AX)
+	VMOVDQA Y6, 512(AX)
+	VMOVDQA Y7, 64(AX)
+	VMOVDQA Y8, 416(AX)
+	VMOVDQA Y9, 768(AX)
+	VPXOR 448(AX), Y14, Y0
+	VPXOR 640(AX), Y10, Y1
+	VPXOR 192(AX), Y11, Y2
+	VPXOR 544(AX), Y12, Y3
+	VPXOR 96(AX), Y13, Y4
+	VPSLLQ $0x1b, Y0, Y5
+	VPSLLQ $0x24, Y1, Y6
+	VPSLLQ $0x0a, Y2, Y7
+	VPSLLQ $0x0f, Y3, Y8
+	VPSLLQ $0x38, Y4, Y9
+	VPSRLQ $0x25, Y0, Y0
+	VPSRLQ $0x1c, Y1, Y1
+	VPSRLQ $0x36, Y2, Y2
+	VPSRLQ $0x31, Y3, Y3
+	VPSRLQ $0x08, Y4, Y4
+	VPOR Y5, Y0, Y0
+	VPOR Y6, Y1, Y1
+	VPOR Y7, Y2, Y2
+	VPOR Y8, Y3, Y3
+	VPOR Y9, Y4, Y4
+	VPANDN Y2, Y1, Y5
+	VPANDN Y3, Y2, Y6
+	VPANDN Y4, Y3, Y7
+	VPANDN Y0, Y4, Y8
+	VPANDN Y1, Y0, Y9
+	VPXOR Y0, Y5, Y5
+	VPXOR Y1, Y6, Y6
+	VPXOR Y2, Y7, Y7
+	VPXOR Y3, Y8, Y8
+	VPXOR Y4, Y9, Y9
+	VMOVDQA Y5, 640(AX)
+	VMOVDQA Y6, 192(AX)
+	VMOVDQA Y7, 544(AX)
+	VMOVDQA Y8, 96(AX)
+	VMOVDQA Y9, 448(AX)
+	VPXOR 224(AX), Y12, Y0
+	VPXOR 576(AX), Y13, Y1
+	VPXOR 128(AX), Y14, Y2
+	VPXOR 320(AX), Y10, Y3
+	VPXOR 672(AX), Y11, Y4
+	VPSLLQ $0x3e, Y0, Y5
+	VPSLLQ $0x37, Y1, Y6
+	VPSLLQ $0x27, Y2, Y7
+	VPSLLQ $0x29, Y3, Y8
+	VPSLLQ $0x02, Y4, Y9
+	VPSRLQ $0x02, Y0, Y0
+	VPSRLQ $0x09, Y1, Y1
+	VPSRLQ $0x19, Y2, Y2
+	VPSRLQ $0x17, Y3, Y3
+	VPSRLQ $0x3e, Y4, Y4
+	VPOR Y5, Y0, Y0
+	VPOR Y6, Y1, Y1
+	VPOR Y7, Y2, Y2
+	VPOR Y8, Y3, Y3
+	VPOR Y9, Y4, Y4
+	VPANDN Y2, Y1, Y5
+	VPANDN Y3, Y2, Y6
+	VPANDN Y4, Y3, Y7
+	VPANDN Y0, Y4, Y8
+	VPANDN Y1, Y0, Y9
+	VPXOR Y0, Y5, Y5
+	VPXOR Y1, Y6, Y6
+	VPXOR Y2, Y7, Y7
+	VPXOR Y3, Y8, Y8
+	VPXOR Y4, Y9, Y9
+	VMOVDQA Y5, 320(AX)
+	VMOVDQA Y6, 672(AX)
+	VMOVDQA Y7, 224(AX)
+	VMOVDQA Y8, 576(AX)
+	VMOVDQA Y9, 128(AX)
+	VMOVDQA (AX), Y0
+	VMOVDQA 32(AX), Y1
+	VMOVDQA 64(AX), Y2
+	VMOVDQA 96(AX), Y3
+	VMOVDQA 128(AX), Y4
+	VPXOR 160(AX), Y0, Y0
+	VPXOR 192(AX), Y1, Y1
+	VPXOR 224(AX), Y2, Y2
+	VPXOR 256(AX), Y3, Y3
+	VPXOR 288(AX), Y4, Y4
+	VPXOR 320(AX), Y0, Y0
+	VPXOR 352(AX), Y1, Y1
+	VPXOR 384(AX), Y2, Y2
+	VPXOR 416(AX), Y3, Y3
+	VPXOR 448(AX), Y4, Y4
+	VPXOR 480(AX), Y0, Y0
+	VPXOR 512(AX), Y1, Y1
+	VPXOR 544(AX), Y2, Y2
+	VPXOR 576(AX), Y3, Y3
+	VPXOR 608(AX), Y4, Y4
+	VPXOR 640(AX), Y0, Y0
+	VPXOR 672(AX), Y1, Y1
+	VPXOR 704(AX), Y2, Y2
+	VPXOR 736(AX), Y3, Y3
+	VPXOR 768(AX), Y4, Y4
+	VPSLLQ $0x01, Y1, Y5
+	VPSLLQ $0x01, Y2, Y6
+	VPSLLQ $0x01, Y3, Y7
+	VPSLLQ $0x01, Y4, Y8
+	VPSLLQ $0x01, Y0, Y9
+	VPSRLQ $0x3f, Y1, Y10
+	VPSRLQ $0x3f, Y2, Y11
+	VPSRLQ $0x3f, Y3, Y12
+	VPSRLQ $0x3f, Y4, Y13
+	VPSRLQ $0x3f, Y0, Y14
+	VPOR Y5, Y10, Y10
+	VPOR Y6, Y11, Y11
+	VPOR Y7, Y12, Y12
+	VPOR Y8, Y13, Y13
+	VPOR Y9, Y14, Y14
+	VPXOR Y10, Y4, Y10
+	VPXOR Y11, Y0, Y11
+	VPXOR Y12, Y1, Y12
+	VPXOR Y13, Y2, Y13
+	VPXOR Y14, Y3, Y14
+	VPXOR (AX), Y10, Y0
+	VPXOR 32(AX), Y11, Y1
+	VPXOR 64(AX), Y12, Y2
+	VPXOR 96(AX), Y13, Y3
+	VPXOR 128(AX), Y14, Y4
+	VPSLLQ $0x2c, Y1, Y6
+	VPSLLQ $0x2b, Y2, Y7
+	VPSLLQ $0x15, Y3, Y8
+	VPSLLQ $0x0e, Y4, Y9
+	VPSRLQ $0x14, Y1, Y1
+	VPSRLQ $0x15, Y2, Y2
+	VPSRLQ $0x2b, Y3, Y3
+	VPSRLQ $0x32, Y4, Y4
+	VPOR Y6, Y1, Y1
+	VPOR Y7, Y2, Y2
+	VPOR Y8, Y3, Y3
+	VPOR Y9, Y4, Y4
+	VPANDN Y2, Y1, Y5
+	VPANDN Y3, Y2, Y6
+	VPANDN Y4, Y3, Y7
+	VPANDN Y0, Y4, Y8
+	VPANDN Y1, Y0, Y9
+	VPXOR Y0, Y5, Y5
+	VPXOR Y1, Y6, Y6
+	VPXOR Y2, Y7, Y7
+	VPXOR Y3, Y8, Y8
+	VPXOR Y4, Y9, Y9
+	VPBROADCASTQ 24(CX), Y0
+	VPXOR Y0, Y5, Y5
+	VMOVDQA Y5, (AX)
+	VMOVDQA Y6, 32(AX)
+	VMOVDQA Y7, 64(AX)
+	VMOVDQA Y8, 96(AX)
+	VMOVDQA Y9, 128(AX)
+	VPXOR 256(AX), Y13, Y0
+	VPXOR 288(AX), Y14, Y1
+	VPXOR 160(AX), Y10, Y2
+	VPXOR 192(AX), Y11, Y3
+	VPXOR 224(AX), Y12, Y4
+	VPSLLQ $0x1c, Y0, Y5
+	VPSLLQ $0x14, Y1, Y6
+	VPSLLQ $0x03, Y2, Y7
+	VPSLLQ $0x2d, Y3, Y8
+	VPSLLQ $0x3d, Y4, Y9
+	VPSRLQ $0x24, Y0, Y0
+	VPSRLQ $0x2c, Y1, Y1
+	VPSRLQ $0x3d, Y2, Y2
+	VPSRLQ $0x13, Y3, Y3
+	VPSRLQ $0x03, Y4, Y4
+	VPOR Y5, Y0, Y0
+	VPOR Y6, Y1, Y1
+	VPOR Y7, Y2, Y2
+	VPOR Y8, Y3, Y3
+	VPOR Y9, Y4, Y4
+	VPANDN Y2, Y1, Y5
+	VPANDN Y3, Y2, Y6
+	VPANDN Y4, Y3, Y7
+	VPANDN Y0, Y4, Y8
+	VPANDN Y1, Y0, Y9
+	VPXOR Y0, Y5, Y5
+	VPXOR Y1, Y6, Y6
+	VPXOR Y2, Y7, Y7
+	VPXOR Y3, Y8, Y8
+	VPXOR Y4, Y9, Y9
+	VMOVDQA Y5, 160(AX)
+	VMOVDQA Y6, 192(AX)
+	VMOVDQA Y7, 224(AX)
+	VMOVDQA Y8, 256(AX)
+	VMOVDQA Y9, 288(AX)
+	VPXOR 352(AX), Y11, Y0
+	VPXOR 384(AX), Y12, Y1
+	VPXOR 416(AX), Y13, Y2
+	VPXOR 448(AX), Y14, Y3
+	VPXOR 320(AX), Y10, Y4
+	VPSLLQ $0x01, Y0, Y5
+	VPSLLQ $0x06, Y1, Y6
+	VPSLLQ $0x19, Y2, Y7
+	VPSLLQ $0x08, Y3, Y8
+	VPSLLQ $0x12, Y4, Y9
+	VPSRLQ $0x3f, Y0, Y0
+	VPSRLQ $0x3a, Y1, Y1
+	VPSRLQ $0x27, Y2, Y2
+	VPSRLQ $0x38, Y3, Y3
+	VPSRLQ $0x2e, Y4, Y4
+	VPOR Y5, Y0, Y0
+	VPOR Y6, Y1, Y1
+	VPOR Y7, Y2, Y2
+	VPOR Y8, Y3, Y3
+	VPOR Y9, Y4, Y4
+	VPANDN Y2, Y1, Y5
+	VPANDN Y3, Y2, Y6
+	VPANDN Y4, Y3, Y7
+	VPANDN Y0, Y4, Y8
+	VPANDN Y1, Y0, Y9
+	VPXOR Y0, Y5, Y5
+	VPXOR Y1, Y6, Y6
+	VPXOR Y2, Y7, Y7
+	VPXOR Y3, Y8, Y8
+	VPXOR Y4, Y9, Y9
+	VMOVDQA Y5, 320(AX)
+	VMOVDQA Y6, 352(AX)
+	VMOVDQA Y7, 384(AX)
+	VMOVDQA Y8, 416(AX)
+	VMOVDQA Y9, 448(AX)
+	VPXOR 608(AX), Y14, Y0
+	VPXOR 480(AX), Y10, Y1
+	VPXOR 512(AX), Y11, Y2
+	VPXOR 544(AX), Y12, Y3
+	VPXOR 576(AX), Y13, Y4
+	VPSLLQ $0x1b, Y0, Y5
+	VPSLLQ $0x24, Y1, Y6
+	VPSLLQ $0x0a, Y2, Y7
+	VPSLLQ $0x0f, Y3, Y8
+	VPSLLQ $0x38, Y4, Y9
+	VPSRLQ $0x25, Y0, Y0
+	VPSRLQ $0x1c, Y1, Y1
+	VPSRLQ $0x36, Y2, Y2
+	VPSRLQ $0x31, Y3, Y3
+	VPSRLQ $0x08, Y4, Y4
+	VPOR Y5, Y0, Y0
+	VPOR Y6, Y1, Y1
+	VPOR Y7, Y2, Y2
+	VPOR Y8, Y3, Y3
+	VPOR Y9, Y4, Y4
+	VPANDN Y2, Y1, Y5
+	VPANDN Y3, Y2, Y6
+	VPANDN Y4, Y3, Y7
+	VPANDN Y0, Y4, Y8
+	VPANDN Y1, Y0, Y9
+	VPXOR Y0, Y5, Y5
+	VPXOR Y1, Y6, Y6
+	VPXOR Y2, Y7, Y7
+	VPXOR Y3, Y8, Y8
+	VPXOR Y4, Y9, Y9
+	VMOVDQA Y5, 480(AX)
+	VMOVDQA Y6, 512(AX)
+	VMOVDQA Y7, 544(AX)
+	VMOVDQA Y8, 576(AX)
+	VMOVDQA Y9, 608(AX)
+	VPXOR 704(AX), Y12, Y0
+	VPXOR 736(AX), Y13, Y1
+	VPXOR 768(AX), Y14, Y2
+	VPXOR 640(AX), Y10, Y3
+	VPXOR 672(AX), Y11, Y4
+	VPSLLQ $0x3e, Y0, Y5
+	VPSLLQ $0x37, Y1, Y6
+	VPSLLQ $0x27, Y2, Y7
+	VPSLLQ $0x29, Y3, Y8
+	VPSLLQ $0x02, Y4, Y9
+	VPSRLQ $0x02, Y0, Y0
+	VPSRLQ $0x09, Y1, Y1
+	VPSRLQ $0x19, Y2, Y2
+	VPSRLQ $0x17, Y3, Y3
+	VPSRLQ $0x3e, Y4, Y4
+	VPOR Y5, Y0, Y0
+	VPOR Y6, Y1, Y1
+	VPOR Y7, Y2, Y2
+	VPOR Y8, Y3, Y3
+	VPOR Y9, Y4, Y4
+	VPANDN Y2, Y1, Y5
+	VPANDN Y3, Y2, Y6
+	VPANDN Y4, Y3, Y7
+	VPANDN Y0, Y4, Y8
+	VPANDN Y1, Y0, Y9
+	VPXOR Y0, Y5, Y5
+	VPXOR Y1, Y6, Y6
+	VPXOR Y2, Y7, Y7
+	VPXOR Y3, Y8, Y8
+	VPXOR Y4, Y9, Y9
+	VMOVDQA Y5, 640(AX)
+	VMOVDQA Y6, 672(AX)
+	VMOVDQA Y7, 704(AX)
+	VMOVDQA Y8, 736(AX)
+	VMOVDQA Y9, 768(AX)
+	ADDQ $0x20, CX
+	SUBQ $0x00000001, DX
+	JNZ loop
+	RET
diff --git a/vendor/github.com/cloudflare/circl/simd/keccakf1600/f1600x4stubs_amd64.go b/vendor/github.com/cloudflare/circl/simd/keccakf1600/f1600x4stubs_amd64.go
new file mode 100644
index 0000000000..102fdd04d1
--- /dev/null
+++ b/vendor/github.com/cloudflare/circl/simd/keccakf1600/f1600x4stubs_amd64.go
@@ -0,0 +1,8 @@
+// Code generated by command: go run src.go -out ../../f1600x4_amd64.s -stubs ../../f1600x4stubs_amd64.go -pkg keccakf1600. DO NOT EDIT.
+
+//go:build amd64 && !purego
+
+package keccakf1600
+
+//go:noescape
+func f1600x4AVX2(state *uint64, rc *[24]uint64, turbo bool)
diff --git a/vendor/github.com/cloudflare/circl/simd/keccakf1600/fallback.go b/vendor/github.com/cloudflare/circl/simd/keccakf1600/fallback.go
new file mode 100644
index 0000000000..0da75e9b77
--- /dev/null
+++ b/vendor/github.com/cloudflare/circl/simd/keccakf1600/fallback.go
@@ -0,0 +1,8 @@
+//go:build (!amd64 && !arm64) || (arm64 && !go1.16) || purego
+// +build !amd64,!arm64 arm64,!go1.16 purego
+
+package keccakf1600
+
+func permuteSIMDx2(state []uint64, turbo bool) { permuteScalarX2(state, turbo) }
+
+func permuteSIMDx4(state []uint64, turbo bool) { permuteScalarX4(state, turbo) }
diff --git a/vendor/modules.txt b/vendor/modules.txt
index ad50f53486..427c123c5c 100644
--- a/vendor/modules.txt
+++ b/vendor/modules.txt
@@ -612,8 +612,8 @@ github.com/cloudevents/sdk-go/v2/event/datacodec/xml
 github.com/cloudevents/sdk-go/v2/protocol
 github.com/cloudevents/sdk-go/v2/protocol/http
 github.com/cloudevents/sdk-go/v2/types
-# github.com/cloudflare/circl v1.3.7
-## explicit; go 1.19
+# github.com/cloudflare/circl v1.6.1
+## explicit; go 1.22.0
 github.com/cloudflare/circl/dh/x25519
 github.com/cloudflare/circl/dh/x448
 github.com/cloudflare/circl/ecc/goldilocks
@@ -626,6 +626,11 @@ github.com/cloudflare/circl/math/mlsbset
 github.com/cloudflare/circl/sign
 github.com/cloudflare/circl/sign/ed25519
 github.com/cloudflare/circl/sign/ed448
+github.com/cloudflare/circl/sign/internal/dilithium
+github.com/cloudflare/circl/sign/internal/dilithium/params
+github.com/cloudflare/circl/sign/mldsa/mldsa65
+github.com/cloudflare/circl/sign/mldsa/mldsa65/internal
+github.com/cloudflare/circl/simd/keccakf1600
 # github.com/cncf/xds/go v0.0.0-20250326154945-ae57f3c0d45f
 ## explicit; go 1.19
 github.com/cncf/xds/go/udpa/annotations
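The vendored keccakf1600 package above supplies the four-way Keccak-f[1600] permutation that mldsa65 uses to batch its hashing work: on amd64 the generated f1600x4AVX2 kernel permutes four interleaved states per call, while fallback.go routes other platforms to the scalar implementation. For orientation, below is a minimal sketch of the mldsa65 API vendored by this change (key generation, deterministic signing into a caller-provided buffer, and verification); the message bytes and use of crypto/rand are illustrative, and the nil context with randomized=false matches the deterministic signing mode exercised elsewhere in this change.

package main

import (
	"crypto/rand"
	"fmt"

	"github.com/cloudflare/circl/sign/mldsa/mldsa65"
)

func main() {
	// GenerateKey draws seed material from the supplied reader.
	pub, priv, err := mldsa65.GenerateKey(rand.Reader)
	if err != nil {
		panic(err)
	}

	msg := []byte("example payload") // illustrative message only

	// SignTo writes the signature into a buffer of exactly
	// mldsa65.SignatureSize bytes; a nil context plus randomized=false
	// selects deterministic signing.
	sig := make([]byte, mldsa65.SignatureSize)
	if err := mldsa65.SignTo(priv, msg, nil, false, sig); err != nil {
		panic(err)
	}

	// Verify reports whether sig is valid for msg (and the same nil
	// context) under pub.
	fmt.Println("verified:", mldsa65.Verify(pub, msg, nil, sig))
}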