diff --git a/go.mod b/go.mod index 5b7ff11519..3e2037f37f 100644 --- a/go.mod +++ b/go.mod @@ -7,6 +7,7 @@ toolchain go1.24.1 require ( cloud.google.com/go/compute/metadata v0.7.0 cloud.google.com/go/storage v1.54.0 + github.com/cloudflare/circl v1.6.1 github.com/fsnotify/fsnotify v1.9.0 github.com/golangci/golangci-lint v1.64.8 github.com/google/addlicense v1.1.1 @@ -164,7 +165,6 @@ require ( github.com/ckaznocha/intrange v0.3.0 // indirect github.com/clbanning/mxj/v2 v2.7.0 // indirect github.com/cloudevents/sdk-go/v2 v2.15.2 // indirect - github.com/cloudflare/circl v1.3.7 // indirect github.com/cncf/xds/go v0.0.0-20250326154945-ae57f3c0d45f // indirect github.com/common-nighthawk/go-figure v0.0.0-20210622060536-734e95fb86be // indirect github.com/containerd/stargz-snapshotter/estargz v0.16.3 // indirect diff --git a/go.sum b/go.sum index 2cf9aea9a2..ee241181c2 100644 --- a/go.sum +++ b/go.sum @@ -378,8 +378,8 @@ github.com/clbanning/mxj/v2 v2.7.0/go.mod h1:hNiWqW14h+kc+MdF9C6/YoRfjEJoR3ou6tn github.com/client9/misspell v0.3.4/go.mod h1:qj6jICC3Q7zFZvVWo7KLAzC3yx5G7kyvSDkc90ppPyw= github.com/cloudevents/sdk-go/v2 v2.15.2 h1:54+I5xQEnI73RBhWHxbI1XJcqOFOVJN85vb41+8mHUc= github.com/cloudevents/sdk-go/v2 v2.15.2/go.mod h1:lL7kSWAE/V8VI4Wh0jbL2v/jvqsm6tjmaQBSvxcv4uE= -github.com/cloudflare/circl v1.3.7 h1:qlCDlTPz2n9fu58M0Nh1J/JzcFpfgkFHHX3O35r5vcU= -github.com/cloudflare/circl v1.3.7/go.mod h1:sRTcRWXGLrKw6yIGJ+l7amYJFfAXbZG0kBSc8r4zxgA= +github.com/cloudflare/circl v1.6.1 h1:zqIqSPIndyBh1bjLVVDHMPpVKqp8Su/V+6MeDzzQBQ0= +github.com/cloudflare/circl v1.6.1/go.mod h1:uddAzsPgqdMAYatqJ0lsjX1oECcQLIlRpzZh3pJrofs= github.com/cncf/udpa/go v0.0.0-20191209042840-269d4d468f6f/go.mod h1:M8M6+tZqaGXZJjfX53e64911xZQV5JYwmTeXPW+k8Sc= github.com/cncf/udpa/go v0.0.0-20200629203442-efcf912fb354/go.mod h1:WmhPx2Nbnhtbo57+VJT5O0JRkEi1Wbu0z5j0R8u5Hbk= github.com/cncf/udpa/go v0.0.0-20201120205902-5459f2c99403/go.mod h1:WmhPx2Nbnhtbo57+VJT5O0JRkEi1Wbu0z5j0R8u5Hbk= diff --git a/pkg/chains/signing/mldsa/mldsa.go b/pkg/chains/signing/mldsa/mldsa.go new file mode 100644 index 0000000000..88a781f52b --- /dev/null +++ b/pkg/chains/signing/mldsa/mldsa.go @@ -0,0 +1,141 @@ +/* +Copyright 2024 The Tekton Authors +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +package mldsa + +import ( + "crypto" + "errors" + "io" + + "github.com/cloudflare/circl/sign/mldsa/mldsa65" + "github.com/sigstore/sigstore/pkg/signature" +) + +// SignerVerifier implements signature.SignerVerifier and crypto.Signer for MLDSA +type SignerVerifier struct { + priv *mldsa65.PrivateKey + pub *mldsa65.PublicKey +} + +// LoadSignerVerifier creates a new SignerVerifier from a private key +func LoadSignerVerifier(priv *mldsa65.PrivateKey) (*SignerVerifier, error) { + if priv == nil { + return nil, errors.New("private key cannot be nil") + } + + // Get the public key from the private key + pub := priv.Public().(*mldsa65.PublicKey) + + return &SignerVerifier{ + priv: priv, + pub: pub, + }, nil +} + +// LoadVerifier creates a new SignerVerifier from a public key +func LoadVerifier(pub *mldsa65.PublicKey) (*SignerVerifier, error) { + if pub == nil { + return nil, errors.New("public key cannot be nil") + } + + return &SignerVerifier{ + pub: pub, + }, nil +} + +// Public implements crypto.Signer interface +func (s *SignerVerifier) Public() crypto.PublicKey { + return s.pub +} + +// Sign signs the given data +func (s *SignerVerifier) Sign(data []byte) ([]byte, error) { + if s.priv == nil { + return nil, errors.New("private key not available for signing") + } + + sig := make([]byte, mldsa65.SignatureSize) + err := mldsa65.SignTo(s.priv, data, nil, false, sig) + if err != nil { + return nil, err + } + return sig, nil +} + +// SignWithOpts implements crypto.Signer interface +func (s *SignerVerifier) SignWithOpts(rand io.Reader, digest []byte, opts crypto.SignerOpts) ([]byte, error) { + // MLDSA doesn't use pre-hashing, so we use the input directly + return s.Sign(digest) +} + +// SignMessage signs a message from a reader +func (s *SignerVerifier) SignMessage(message io.Reader, opts ...signature.SignOption) ([]byte, error) { + data, err := io.ReadAll(message) + if err != nil { + return nil, err + } + + return s.Sign(data) +} + +// Verify verifies the signature against the data +func (s *SignerVerifier) Verify(data, sig []byte) error { + if s.pub == nil { + return errors.New("public key not available for verification") + } + + if len(sig) != mldsa65.SignatureSize { + return errors.New("invalid signature size") + } + + if !mldsa65.Verify(s.pub, data, nil, sig) { + return errors.New("invalid signature") + } + + return nil +} + +// VerifySignature verifies a signature from readers +func (s *SignerVerifier) VerifySignature(signature, message io.Reader, opts ...signature.VerifyOption) error { + sig, err := io.ReadAll(signature) + if err != nil { + return err + } + + data, err := io.ReadAll(message) + if err != nil { + return err + } + + return s.Verify(data, sig) +} + +// PublicKey returns the public key with optional parameters +func (s *SignerVerifier) PublicKey(opts ...signature.PublicKeyOption) (crypto.PublicKey, error) { + return s.pub, nil +} + +// Type returns the key type for SSH and other uses +func (s *SignerVerifier) Type() string { + return "mldsa65-sha256" +} + +// CreateKey generates a new key pair +func (s *SignerVerifier) CreateKey(rand io.Reader) (crypto.PublicKey, crypto.PrivateKey, error) { + pub, priv, err := mldsa65.GenerateKey(rand) + if err != nil { + return nil, nil, err + } + return pub, priv, nil +} diff --git a/pkg/chains/signing/wrap.go b/pkg/chains/signing/wrap.go index ee032b1321..8ac7ba2e2c 100644 --- a/pkg/chains/signing/wrap.go +++ b/pkg/chains/signing/wrap.go @@ -17,10 +17,13 @@ import ( "bytes" "context" "crypto" + "crypto/sha256" + "encoding/base64" 
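+	// sha256 and base64 are used below to derive an SSH-style
+	// "SHA256:..." fingerprint for ML-DSA public keys, which
+	// ssh.NewPublicKey cannot wrap.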
"encoding/json" "fmt" "io" + "github.com/cloudflare/circl/sign/mldsa/mldsa65" "github.com/in-toto/in-toto-golang/in_toto" "github.com/secure-systems-lab/go-securesystemslib/dsse" "github.com/sigstore/sigstore/pkg/signature" @@ -34,23 +37,40 @@ func Wrap(s Signer) (Signer, error) { return nil, err } - // Generate public key fingerprint - sshpk, err := ssh.NewPublicKey(pub) - if err != nil { - return nil, err + var fingerprint string + var pk crypto.PublicKey + + // Handle MLDSA keys differently + if mldsaPub, ok := pub.(*mldsa65.PublicKey); ok { + // Generate fingerprint from MLDSA public key bytes + pkBytes, err := mldsaPub.MarshalBinary() + if err != nil { + return nil, fmt.Errorf("failed to marshal MLDSA public key: %w", err) + } + hash := sha256.Sum256(pkBytes) + fingerprint = "SHA256:" + base64.StdEncoding.EncodeToString(hash[:]) + pk = pub + } else { + // For other key types, use SSH public key + sshpk, err := ssh.NewPublicKey(pub) + if err != nil { + return nil, err + } + fingerprint = ssh.FingerprintSHA256(sshpk) + pk = sshpk } - fingerprint := ssh.FingerprintSHA256(sshpk) adapter := sslAdapter{ wrapped: s, keyID: fingerprint, - pk: sshpk, + pk: pk, } envelope, err := dsse.NewEnvelopeSigner(&adapter) if err != nil { return nil, err } + return &sslSigner{ wrapper: envelope, typ: s.Type(), diff --git a/pkg/chains/signing/x509/x509.go b/pkg/chains/signing/x509/x509.go index 82e76070ed..6ed0b5ac4d 100644 --- a/pkg/chains/signing/x509/x509.go +++ b/pkg/chains/signing/x509/x509.go @@ -18,6 +18,7 @@ import ( "crypto" "crypto/ecdsa" cx509 "crypto/x509" + "encoding/asn1" "encoding/json" "encoding/pem" "fmt" @@ -34,9 +35,11 @@ import ( "github.com/sigstore/cosign/v2/pkg/providers" "knative.dev/pkg/logging" + "github.com/cloudflare/circl/sign/mldsa/mldsa65" "github.com/sigstore/sigstore/pkg/signature" "github.com/sigstore/sigstore/pkg/tuf" "github.com/tektoncd/chains/pkg/chains/signing" + "github.com/tektoncd/chains/pkg/chains/signing/mldsa" "github.com/tektoncd/chains/pkg/config" ) @@ -44,6 +47,20 @@ const ( defaultOIDCClientID = "sigstore" ) +// MLDSA65 OID: 2.16.840.1.101.3.4.3.18 +var mldsaOID = asn1.ObjectIdentifier{2, 16, 840, 1, 101, 3, 4, 3, 18} + +type pkcs8 struct { + Version int + Algorithm pkcs8Algorithm + PrivateKey []byte +} + +type pkcs8Algorithm struct { + Algorithm asn1.ObjectIdentifier + Parameters asn1.RawValue `asn1:"optional"` +} + // Signer exposes methods to sign payloads. 
type Signer struct { cert string @@ -175,23 +192,82 @@ func loadRootFromURL(root string) ([]byte, error) { return io.ReadAll(resp.Body) } +func extractMLDSAFromPKCS8(der []byte) (*mldsa65.PrivateKey, error) { + // PKCS#8 structure typically has the raw key at the end + // For MLDSA65, we need exactly mldsa65.PrivateKeySize bytes + + if len(der) < mldsa65.PrivateKeySize { + return nil, fmt.Errorf("PKCS#8 data too short: %d bytes, need at least %d", + len(der), mldsa65.PrivateKeySize) + } + + // Strategy 1: Try the last PrivateKeySize bytes (most common case) + if len(der) >= mldsa65.PrivateKeySize { + rawKey := der[len(der)-mldsa65.PrivateKeySize:] + var mldsaKey mldsa65.PrivateKey + if err := mldsaKey.UnmarshalBinary(rawKey); err == nil { + return &mldsaKey, nil + } + } + return nil, fmt.Errorf("no valid MLDSA key found in PKCS#8 data") +} + +func tryLoadMLDSA(data []byte) (*mldsa65.PrivateKey, error) { + // First try direct raw format + var mldsaKey mldsa65.PrivateKey + if err := mldsaKey.UnmarshalBinary(data); err == nil { + return &mldsaKey, nil + } + + // Then try to extract from PKCS#8 + if key, err := extractMLDSAFromPKCS8(data); err == nil { + return key, nil + } + + return nil, fmt.Errorf("data is neither raw MLDSA key nor PKCS#8 wrapped MLDSA key") +} + func x509Signer(ctx context.Context, privateKey []byte) (*Signer, error) { logger := logging.FromContext(ctx) logger.Info("Found x509 key...") p, _ := pem.Decode(privateKey) - if p.Type != "PRIVATE KEY" { - return nil, fmt.Errorf("expected private key, found object of type %s", p.Type) - } - pk, err := cx509.ParsePKCS8PrivateKey(p.Bytes) - if err != nil { - return nil, err + if p == nil { + return nil, fmt.Errorf("failed to decode PEM block") } - signer, err := signature.LoadECDSASignerVerifier(pk.(*ecdsa.PrivateKey), crypto.SHA256) - if err != nil { - return nil, err + + logger.Infof("Attempting to parse private key of type: %s", p.Type) + + switch p.Type { + case "PRIVATE KEY": + // Try PKCS#8 first for ECDSA keys + if pk, err := cx509.ParsePKCS8PrivateKey(p.Bytes); err == nil { + if ecKey, ok := pk.(*ecdsa.PrivateKey); ok { + logger.Info("Using ECDSA private key...") + signer, err := signature.LoadECDSASignerVerifier(ecKey, crypto.SHA256) + if err != nil { + return nil, fmt.Errorf("failed to load ECDSA signer: %w", err) + } + return &Signer{SignerVerifier: signer}, nil + } + } + + // Try MLDSA formats + if mldsaKey, err := tryLoadMLDSA(p.Bytes); err == nil { + logger.Info("Using MLDSA private key...") + signer, err := mldsa.LoadSignerVerifier(mldsaKey) + if err != nil { + return nil, fmt.Errorf("failed to load MLDSA signer: %w", err) + } + return &Signer{SignerVerifier: signer}, nil + } else { + logger.Infof("Failed to load MLDSA key: %v", err) + } + + return nil, fmt.Errorf("unsupported private key format - key could not be parsed as PKCS#8 ECDSA or MLDSA") + default: + return nil, fmt.Errorf("expected private key, found object of type %s", p.Type) } - return &Signer{SignerVerifier: signer}, nil } func cosignSigner(ctx context.Context, secretPath string, privateKey []byte) (*Signer, error) { diff --git a/vendor/github.com/cloudflare/circl/dh/x25519/curve_amd64.s b/vendor/github.com/cloudflare/circl/dh/x25519/curve_amd64.s index b7723185b6..ce9f062894 100644 --- a/vendor/github.com/cloudflare/circl/dh/x25519/curve_amd64.s +++ b/vendor/github.com/cloudflare/circl/dh/x25519/curve_amd64.s @@ -1,4 +1,5 @@ -// +build amd64 +//go:build amd64 && !purego +// +build amd64,!purego #include "textflag.h" diff --git 
a/vendor/github.com/cloudflare/circl/dh/x448/curve_amd64.s b/vendor/github.com/cloudflare/circl/dh/x448/curve_amd64.s index 810aa9e648..ed33ba3d03 100644 --- a/vendor/github.com/cloudflare/circl/dh/x448/curve_amd64.s +++ b/vendor/github.com/cloudflare/circl/dh/x448/curve_amd64.s @@ -1,4 +1,5 @@ -// +build amd64 +//go:build amd64 && !purego +// +build amd64,!purego #include "textflag.h" diff --git a/vendor/github.com/cloudflare/circl/ecc/goldilocks/curve.go b/vendor/github.com/cloudflare/circl/ecc/goldilocks/curve.go index 5a939100d2..1f165141a9 100644 --- a/vendor/github.com/cloudflare/circl/ecc/goldilocks/curve.go +++ b/vendor/github.com/cloudflare/circl/ecc/goldilocks/curve.go @@ -18,6 +18,9 @@ func (Curve) Identity() *Point { func (Curve) IsOnCurve(P *Point) bool { x2, y2, t, t2, z2 := &fp.Elt{}, &fp.Elt{}, &fp.Elt{}, &fp.Elt{}, &fp.Elt{} rhs, lhs := &fp.Elt{}, &fp.Elt{} + // Check z != 0 + eq0 := !fp.IsZero(&P.z) + fp.Mul(t, &P.ta, &P.tb) // t = ta*tb fp.Sqr(x2, &P.x) // x^2 fp.Sqr(y2, &P.y) // y^2 @@ -27,13 +30,14 @@ func (Curve) IsOnCurve(P *Point) bool { fp.Mul(rhs, t2, ¶mD) // dt^2 fp.Add(rhs, rhs, z2) // z^2 + dt^2 fp.Sub(lhs, lhs, rhs) // x^2 + y^2 - (z^2 + dt^2) - eq0 := fp.IsZero(lhs) + eq1 := fp.IsZero(lhs) fp.Mul(lhs, &P.x, &P.y) // xy fp.Mul(rhs, t, &P.z) // tz fp.Sub(lhs, lhs, rhs) // xy - tz - eq1 := fp.IsZero(lhs) - return eq0 && eq1 + eq2 := fp.IsZero(lhs) + + return eq0 && eq1 && eq2 } // Generator returns the generator point. diff --git a/vendor/github.com/cloudflare/circl/internal/conv/conv.go b/vendor/github.com/cloudflare/circl/internal/conv/conv.go index 649a8e931d..3fd0df496f 100644 --- a/vendor/github.com/cloudflare/circl/internal/conv/conv.go +++ b/vendor/github.com/cloudflare/circl/internal/conv/conv.go @@ -5,6 +5,8 @@ import ( "fmt" "math/big" "strings" + + "golang.org/x/crypto/cryptobyte" ) // BytesLe2Hex returns an hexadecimal string of a number stored in a @@ -138,3 +140,34 @@ func BigInt2Uint64Le(z []uint64, x *big.Int) { z[i] = 0 } } + +// MarshalBinary encodes a value into a byte array in a format readable by UnmarshalBinary. +func MarshalBinary(v cryptobyte.MarshalingValue) ([]byte, error) { + const DefaultSize = 32 + b := cryptobyte.NewBuilder(make([]byte, 0, DefaultSize)) + b.AddValue(v) + return b.Bytes() +} + +// MarshalBinaryLen encodes a value into an array of n bytes in a format readable by UnmarshalBinary. +func MarshalBinaryLen(v cryptobyte.MarshalingValue, length uint) ([]byte, error) { + b := cryptobyte.NewFixedBuilder(make([]byte, 0, length)) + b.AddValue(v) + return b.Bytes() +} + +// A UnmarshalingValue decodes itself from a cryptobyte.String and advances the pointer. +// It reports whether the read was successful. +type UnmarshalingValue interface { + Unmarshal(*cryptobyte.String) bool +} + +// UnmarshalBinary recovers a value from a byte array. +// It returns an error if the read was unsuccessful. 
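+//
+// A rough round-trip sketch (the names v and w are assumptions of the
+// example: v implements cryptobyte.MarshalingValue and w implements
+// UnmarshalingValue):
+//
+//	buf, err := MarshalBinary(v)
+//	if err == nil {
+//		err = UnmarshalBinary(w, buf)
+//	}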
+func UnmarshalBinary(v UnmarshalingValue, data []byte) (err error) { + s := cryptobyte.String(data) + if data == nil || !v.Unmarshal(&s) || !s.Empty() { + err = fmt.Errorf("cannot read %T from input string", v) + } + return +} diff --git a/vendor/github.com/cloudflare/circl/math/fp25519/fp_amd64.s b/vendor/github.com/cloudflare/circl/math/fp25519/fp_amd64.s index 5c4aeddecb..1fcc2dee17 100644 --- a/vendor/github.com/cloudflare/circl/math/fp25519/fp_amd64.s +++ b/vendor/github.com/cloudflare/circl/math/fp25519/fp_amd64.s @@ -1,4 +1,5 @@ -// +build amd64 +//go:build amd64 && !purego +// +build amd64,!purego #include "textflag.h" #include "fp_amd64.h" diff --git a/vendor/github.com/cloudflare/circl/math/fp448/fp_amd64.s b/vendor/github.com/cloudflare/circl/math/fp448/fp_amd64.s index 435addf5e6..3f1f07c986 100644 --- a/vendor/github.com/cloudflare/circl/math/fp448/fp_amd64.s +++ b/vendor/github.com/cloudflare/circl/math/fp448/fp_amd64.s @@ -1,4 +1,5 @@ -// +build amd64 +//go:build amd64 && !purego +// +build amd64,!purego #include "textflag.h" #include "fp_amd64.h" diff --git a/vendor/github.com/cloudflare/circl/math/integer.go b/vendor/github.com/cloudflare/circl/math/integer.go new file mode 100644 index 0000000000..9c80c23b59 --- /dev/null +++ b/vendor/github.com/cloudflare/circl/math/integer.go @@ -0,0 +1,16 @@ +package math + +import "math/bits" + +// NextPow2 finds the next power of two (N=2^k, k>=0) greater than n. +// If n is already a power of two, then this function returns n, and log2(n). +func NextPow2(n uint) (N uint, k uint) { + if bits.OnesCount(n) == 1 { + k = uint(bits.TrailingZeros(n)) + N = n + } else { + k = uint(bits.Len(n)) + N = uint(1) << k + } + return +} diff --git a/vendor/github.com/cloudflare/circl/sign/ed25519/point.go b/vendor/github.com/cloudflare/circl/sign/ed25519/point.go index 374a69503c..d1c3b146b7 100644 --- a/vendor/github.com/cloudflare/circl/sign/ed25519/point.go +++ b/vendor/github.com/cloudflare/circl/sign/ed25519/point.go @@ -164,7 +164,7 @@ func (P *pointR1) isEqual(Q *pointR1) bool { fp.Mul(r, r, &P.z) fp.Sub(l, l, r) b = b && fp.IsZero(l) - return b + return b && !fp.IsZero(&P.z) && !fp.IsZero(&Q.z) } func (P *pointR3) neg() { diff --git a/vendor/github.com/cloudflare/circl/sign/ed448/ed448.go b/vendor/github.com/cloudflare/circl/sign/ed448/ed448.go index 324bd8f334..c368b181b4 100644 --- a/vendor/github.com/cloudflare/circl/sign/ed448/ed448.go +++ b/vendor/github.com/cloudflare/circl/sign/ed448/ed448.go @@ -206,7 +206,7 @@ func newKeyFromSeed(privateKey, seed []byte) { func signAll(signature []byte, privateKey PrivateKey, message, ctx []byte, preHash bool) { if len(ctx) > ContextMaxSize { - panic(fmt.Errorf("ed448: bad context length: " + strconv.Itoa(len(ctx)))) + panic(fmt.Errorf("ed448: bad context length: %v", len(ctx))) } H := sha3.NewShake256() diff --git a/vendor/github.com/cloudflare/circl/sign/internal/dilithium/amd64.go b/vendor/github.com/cloudflare/circl/sign/internal/dilithium/amd64.go new file mode 100644 index 0000000000..d5d224ee84 --- /dev/null +++ b/vendor/github.com/cloudflare/circl/sign/internal/dilithium/amd64.go @@ -0,0 +1,154 @@ +//go:build amd64 && !purego +// +build amd64,!purego + +package dilithium + +import ( + "golang.org/x/sys/cpu" +) + +// Execute an in-place forward NTT on as. +// +// Assumes the coefficients are in Montgomery representation and bounded +// by 2*Q. The resulting coefficients are again in Montgomery representation, +// but are only bounded bt 18*Q. 
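+//
+// A rough sketch of the usual pattern, multiplication via the NTT
+// domain (cf. PolyDotHat in the mode packages; a and b are assumed to
+// be Polys in Montgomery form bounded by 2*Q):
+//
+//	a.NTT()
+//	b.NTT()
+//	var p Poly
+//	p.MulHat(&a, &b) // coefficients bounded by 2q
+//	p.InvNTT()       // negacyclic product, Montgomery conventions apply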
+func (p *Poly) NTT() { + if cpu.X86.HasAVX2 { + nttAVX2( + (*[N]uint32)(p), + ) + } else { + p.nttGeneric() + } +} + +// Execute an in-place inverse NTT and multiply by Montgomery factor R +// +// Assumes the coefficients are in Montgomery representation and bounded +// by 2*Q. The resulting coefficients are again in Montgomery representation +// and bounded by 2*Q. +func (p *Poly) InvNTT() { + if cpu.X86.HasAVX2 { + invNttAVX2( + (*[N]uint32)(p), + ) + } else { + p.invNttGeneric() + } +} + +// Sets p to the polynomial whose coefficients are the pointwise multiplication +// of those of a and b. The coefficients of p are bounded by 2q. +// +// Assumes a and b are in Montgomery form and that the pointwise product +// of each coefficient is below 2³² q. +func (p *Poly) MulHat(a, b *Poly) { + if cpu.X86.HasAVX2 { + mulHatAVX2( + (*[N]uint32)(p), + (*[N]uint32)(a), + (*[N]uint32)(b), + ) + } else { + p.mulHatGeneric(a, b) + } +} + +// Sets p to a + b. Does not normalize polynomials. +func (p *Poly) Add(a, b *Poly) { + if cpu.X86.HasAVX2 { + addAVX2( + (*[N]uint32)(p), + (*[N]uint32)(a), + (*[N]uint32)(b), + ) + } else { + p.addGeneric(a, b) + } +} + +// Sets p to a - b. +// +// Warning: assumes coefficients of b are less than 2q. +// Sets p to a + b. Does not normalize polynomials. +func (p *Poly) Sub(a, b *Poly) { + if cpu.X86.HasAVX2 { + subAVX2( + (*[N]uint32)(p), + (*[N]uint32)(a), + (*[N]uint32)(b), + ) + } else { + p.subGeneric(a, b) + } +} + +// Writes p whose coefficients are in [0, 16) to buf, which must be of +// length N/2. +func (p *Poly) PackLe16(buf []byte) { + if cpu.X86.HasAVX2 { + if len(buf) < PolyLe16Size { + panic("buf too small") + } + packLe16AVX2( + (*[N]uint32)(p), + &buf[0], + ) + } else { + p.packLe16Generic(buf) + } +} + +// Reduces each of the coefficients to <2q. +func (p *Poly) ReduceLe2Q() { + if cpu.X86.HasAVX2 { + reduceLe2QAVX2((*[N]uint32)(p)) + } else { + p.reduceLe2QGeneric() + } +} + +// Reduce each of the coefficients to > 23 + x2 := x & 0x7FFFFF // 2²³-1 + return x2 + (x1 << 13) - x1 +} + +// Returns x mod q. +func modQ(x uint32) uint32 { + return le2qModQ(ReduceLe2Q(x)) +} + +// For x R ≤ q 2³², find y ≤ 2q with y = x mod q. +func montReduceLe2Q(x uint64) uint32 { + // Qinv = 4236238847 = -(q⁻¹) mod 2³² + m := (x * Qinv) & 0xffffffff + return uint32((x + m*uint64(Q)) >> 32) +} + +// Returns x mod q for 0 ≤ x < 2q. +func le2qModQ(x uint32) uint32 { + x -= Q + mask := uint32(int32(x) >> 31) // mask is 2³²-1 if x was neg.; 0 otherwise + return x + (mask & Q) +} + +// Splits 0 ≤ a < Q into a0 and a1 with a = a1*2ᴰ + a0 +// and -2ᴰ⁻¹ < a0 < 2ᴰ⁻¹. Returns a0 + Q and a1. +func power2round(a uint32) (a0plusQ, a1 uint32) { + // We effectively compute a0 = a mod± 2ᵈ + // and a1 = (a - a0) / 2ᵈ. + a0 := a & ((1 << D) - 1) // a mod 2ᵈ + + // a0 is one of 0, 1, ..., 2ᵈ⁻¹-1, 2ᵈ⁻¹, 2ᵈ⁻¹+1, ..., 2ᵈ-1 + a0 -= (1 << (D - 1)) + 1 + // now a0 is -2ᵈ⁻¹-1, -2ᵈ⁻¹, ..., -2, -1, 0, ..., 2ᵈ⁻¹-2 + // Next, we add 2ᴰ to those a0 that are negative (seen as int32). + a0 += uint32(int32(a0)>>31) & (1 << D) + // now a0 is 2ᵈ⁻¹-1, 2ᵈ⁻¹, ..., 2ᵈ-2, 2ᵈ-1, 0, ..., 2ᵈ⁻¹-2 + a0 -= (1 << (D - 1)) - 1 + // now a0 id 0, 1, 2, ..., 2ᵈ⁻¹-1, 2ᵈ⁻¹-1, -2ᵈ⁻¹-1, ... + // which is what we want. 
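+	// Worked example (D = 13): a = 5000 = 1·2¹³ − 3192, so a1 = 1 and
+	// a0 = −3192, which is returned as a0plusQ = Q − 3192.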
+ a0plusQ = Q + a0 + a1 = (a - a0) >> D + return +} diff --git a/vendor/github.com/cloudflare/circl/sign/internal/dilithium/generic.go b/vendor/github.com/cloudflare/circl/sign/internal/dilithium/generic.go new file mode 100644 index 0000000000..25321f5d55 --- /dev/null +++ b/vendor/github.com/cloudflare/circl/sign/internal/dilithium/generic.go @@ -0,0 +1,81 @@ +//go:build !amd64 || purego +// +build !amd64 purego + +package dilithium + +// Execute an in-place forward NTT on as. +// +// Assumes the coefficients are in Montgomery representation and bounded +// by 2*Q. The resulting coefficients are again in Montgomery representation, +// but are only bounded bt 18*Q. +func (p *Poly) NTT() { + p.nttGeneric() +} + +// Execute an in-place inverse NTT and multiply by Montgomery factor R +// +// Assumes the coefficients are in Montgomery representation and bounded +// by 2*Q. The resulting coefficients are again in Montgomery representation +// and bounded by 2*Q. +func (p *Poly) InvNTT() { + p.invNttGeneric() +} + +// Sets p to the polynomial whose coefficients are the pointwise multiplication +// of those of a and b. The coefficients of p are bounded by 2q. +// +// Assumes a and b are in Montgomery form and that the pointwise product +// of each coefficient is below 2³² q. +func (p *Poly) MulHat(a, b *Poly) { + p.mulHatGeneric(a, b) +} + +// Sets p to a + b. Does not normalize polynomials. +func (p *Poly) Add(a, b *Poly) { + p.addGeneric(a, b) +} + +// Sets p to a - b. +// +// Warning: assumes coefficients of b are less than 2q. +// Sets p to a + b. Does not normalize polynomials. +func (p *Poly) Sub(a, b *Poly) { + p.subGeneric(a, b) +} + +// Writes p whose coefficients are in [0, 16) to buf, which must be of +// length N/2. +func (p *Poly) PackLe16(buf []byte) { + p.packLe16Generic(buf) +} + +// Reduces each of the coefficients to <2q. +func (p *Poly) ReduceLe2Q() { + p.reduceLe2QGeneric() +} + +// Reduce each of the coefficients to 0; l >>= 1 { + // On the n-th iteration of the l-loop, the coefficients start off + // bounded by n*2*Q. + // + // offset effectively loops over the row groups in this column; it + // is the first row in the row group. + for offset := uint(0); offset < N-l; offset += 2 * l { + k++ + zeta := uint64(Zetas[k]) + + // j loops over each butterfly in the row group. + for j := offset; j < offset+l; j++ { + t := montReduceLe2Q(zeta * uint64(p[j+l])) + p[j+l] = p[j] + (2*Q - t) // Cooley--Tukey butterfly + p[j] += t + } + } + } +} + +// Execute an in-place inverse NTT and multiply by Montgomery factor R +// +// Assumes the coefficients are in Montgomery representation and bounded +// by 2*Q. The resulting coefficients are again in Montgomery representation +// and bounded by 2*Q. +func (p *Poly) invNttGeneric() { + k := 0 // Index into InvZetas + + // We basically do the opposite of NTT, but postpone dividing by 2 in the + // inverse of the Cooley--Tukey butterfly and accumulate that to a big + // division by 2⁸ at the end. See comments in the NTT() function. + + for l := uint(1); l < N; l <<= 1 { + // On the n-th iteration of the l-loop, the coefficients start off + // bounded by 2ⁿ⁻¹*2*Q, so by 256*Q on the last. 
+ for offset := uint(0); offset < N-l; offset += 2 * l { + zeta := uint64(InvZetas[k]) + k++ + for j := offset; j < offset+l; j++ { + t := p[j] // Gentleman--Sande butterfly + p[j] = t + p[j+l] + t += 256*Q - p[j+l] + p[j+l] = montReduceLe2Q(zeta * uint64(t)) + } + } + } + + for j := uint(0); j < N; j++ { + // ROver256 = 41978 = (256)⁻¹ R² + p[j] = montReduceLe2Q(ROver256 * uint64(p[j])) + } +} diff --git a/vendor/github.com/cloudflare/circl/sign/internal/dilithium/pack.go b/vendor/github.com/cloudflare/circl/sign/internal/dilithium/pack.go new file mode 100644 index 0000000000..4b952a004f --- /dev/null +++ b/vendor/github.com/cloudflare/circl/sign/internal/dilithium/pack.go @@ -0,0 +1,160 @@ +package dilithium + +// Sets p to the polynomial whose coefficients are less than 1024 encoded +// into buf (which must be of size PolyT1Size). +// +// p will be normalized. +func (p *Poly) UnpackT1(buf []byte) { + j := 0 + for i := 0; i < PolyT1Size; i += 5 { + p[j] = (uint32(buf[i]) | (uint32(buf[i+1]) << 8)) & 0x3ff + p[j+1] = (uint32(buf[i+1]>>2) | (uint32(buf[i+2]) << 6)) & 0x3ff + p[j+2] = (uint32(buf[i+2]>>4) | (uint32(buf[i+3]) << 4)) & 0x3ff + p[j+3] = (uint32(buf[i+3]>>6) | (uint32(buf[i+4]) << 2)) & 0x3ff + j += 4 + } +} + +// Writes p whose coefficients are in (-2ᵈ⁻¹, 2ᵈ⁻¹] into buf which +// has to be of length at least PolyT0Size. +// +// Assumes that the coefficients are not normalized, but lie in the +// range (q-2ᵈ⁻¹, q+2ᵈ⁻¹]. +func (p *Poly) PackT0(buf []byte) { + j := 0 + for i := 0; i < PolyT0Size; i += 13 { + p0 := Q + (1 << (D - 1)) - p[j] + p1 := Q + (1 << (D - 1)) - p[j+1] + p2 := Q + (1 << (D - 1)) - p[j+2] + p3 := Q + (1 << (D - 1)) - p[j+3] + p4 := Q + (1 << (D - 1)) - p[j+4] + p5 := Q + (1 << (D - 1)) - p[j+5] + p6 := Q + (1 << (D - 1)) - p[j+6] + p7 := Q + (1 << (D - 1)) - p[j+7] + + buf[i] = byte(p0 >> 0) + buf[i+1] = byte(p0>>8) | byte(p1<<5) + buf[i+2] = byte(p1 >> 3) + buf[i+3] = byte(p1>>11) | byte(p2<<2) + buf[i+4] = byte(p2>>6) | byte(p3<<7) + buf[i+5] = byte(p3 >> 1) + buf[i+6] = byte(p3>>9) | byte(p4<<4) + buf[i+7] = byte(p4 >> 4) + buf[i+8] = byte(p4>>12) | byte(p5<<1) + buf[i+9] = byte(p5>>7) | byte(p6<<6) + buf[i+10] = byte(p6 >> 2) + buf[i+11] = byte(p6>>10) | byte(p7<<3) + buf[i+12] = byte(p7 >> 5) + j += 8 + } +} + +// Sets p to the polynomial packed into buf by PackT0. +// +// The coefficients of p will not be normalized, but will lie +// in (-2ᵈ⁻¹, 2ᵈ⁻¹]. +func (p *Poly) UnpackT0(buf []byte) { + j := 0 + for i := 0; i < PolyT0Size; i += 13 { + p[j] = Q + (1 << (D - 1)) - ((uint32(buf[i]) | + (uint32(buf[i+1]) << 8)) & 0x1fff) + p[j+1] = Q + (1 << (D - 1)) - (((uint32(buf[i+1]) >> 5) | + (uint32(buf[i+2]) << 3) | + (uint32(buf[i+3]) << 11)) & 0x1fff) + p[j+2] = Q + (1 << (D - 1)) - (((uint32(buf[i+3]) >> 2) | + (uint32(buf[i+4]) << 6)) & 0x1fff) + p[j+3] = Q + (1 << (D - 1)) - (((uint32(buf[i+4]) >> 7) | + (uint32(buf[i+5]) << 1) | + (uint32(buf[i+6]) << 9)) & 0x1fff) + p[j+4] = Q + (1 << (D - 1)) - (((uint32(buf[i+6]) >> 4) | + (uint32(buf[i+7]) << 4) | + (uint32(buf[i+8]) << 12)) & 0x1fff) + p[j+5] = Q + (1 << (D - 1)) - (((uint32(buf[i+8]) >> 1) | + (uint32(buf[i+9]) << 7)) & 0x1fff) + p[j+6] = Q + (1 << (D - 1)) - (((uint32(buf[i+9]) >> 6) | + (uint32(buf[i+10]) << 2) | + (uint32(buf[i+11]) << 10)) & 0x1fff) + p[j+7] = Q + (1 << (D - 1)) - ((uint32(buf[i+11]) >> 3) | + (uint32(buf[i+12]) << 5)) + + j += 8 + } +} + +// Writes p whose coefficients are less than 1024 into buf, which must be +// of size at least PolyT1Size . 
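+// (Four 10-bit coefficients pack into five bytes, so PolyT1Size = 320.)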
+// +// Assumes coefficients of p are normalized. +func (p *Poly) PackT1(buf []byte) { + j := 0 + for i := 0; i < PolyT1Size; i += 5 { + buf[i] = byte(p[j]) + buf[i+1] = byte(p[j]>>8) | byte(p[j+1]<<2) + buf[i+2] = byte(p[j+1]>>6) | byte(p[j+2]<<4) + buf[i+3] = byte(p[j+2]>>4) | byte(p[j+3]<<6) + buf[i+4] = byte(p[j+3] >> 2) + j += 4 + } +} + +// Writes p whose coefficients are in [0, 16) to buf, which must be of +// length N/2. +func (p *Poly) packLe16Generic(buf []byte) { + j := 0 + for i := 0; i < PolyLe16Size; i++ { + buf[i] = byte(p[j]) | byte(p[j+1]<<4) + j += 2 + } +} + +// Writes p with 60 non-zero coefficients {-1,1} to buf, which must have +// length 40. +func (p *Poly) PackB60(buf []byte) { + // We start with a mask of the non-zero positions of p (which is 32 bytes) + // and then append 60 packed bits, where a one indicates a negative + // coefficients. + var signs uint64 + mask := uint64(1) + for i := 0; i < 32; i++ { + buf[i] = 0 + for j := 0; j < 8; j++ { + if p[8*i+j] != 0 { + buf[i] |= 1 << uint(j) + if p[8*i+j] == Q-1 { + signs |= mask + } + mask <<= 1 + } + } + } + for i := uint64(0); i < 8; i++ { + buf[i+32] = uint8(signs >> (8 * i)) + } +} + +// UnpackB60 sets p to the polynomial packed into buf with Poly.PackB60(). +// +// Returns whether unpacking was successful. +func (p *Poly) UnpackB60(buf []byte) bool { + *p = Poly{} // zero p + signs := (uint64(buf[32]) | (uint64(buf[33]) << 8) | + (uint64(buf[34]) << 16) | (uint64(buf[35]) << 24) | + (uint64(buf[36]) << 32) | (uint64(buf[37]) << 40) | + (uint64(buf[38]) << 48) | (uint64(buf[39]) << 56)) + if signs>>60 != 0 { + return false // ensure unused bits are zero for strong unforgeability + } + + for i := 0; i < 32; i++ { + for j := 0; j < 8; j++ { + if (buf[i]>>uint(j))&1 == 1 { + p[8*i+j] = 1 + // Note 1 ^ (1 | (Q-1)) = Q-1 and (-1)&x = x + p[8*i+j] ^= uint32(-(signs & 1)) & (1 | (Q - 1)) + signs >>= 1 + } + } + } + + return true +} diff --git a/vendor/github.com/cloudflare/circl/sign/internal/dilithium/params.go b/vendor/github.com/cloudflare/circl/sign/internal/dilithium/params.go new file mode 100644 index 0000000000..f423217f02 --- /dev/null +++ b/vendor/github.com/cloudflare/circl/sign/internal/dilithium/params.go @@ -0,0 +1,18 @@ +package dilithium + +import ( + "github.com/cloudflare/circl/sign/internal/dilithium/params" +) + +const ( + SeedSize = params.SeedSize + N = params.N + Q = params.Q + QBits = params.QBits + Qinv = params.Qinv + ROver256 = params.ROver256 + D = params.D + PolyT1Size = params.PolyT1Size + PolyT0Size = params.PolyT0Size + PolyLe16Size = params.PolyLe16Size +) diff --git a/vendor/github.com/cloudflare/circl/sign/internal/dilithium/params/params.go b/vendor/github.com/cloudflare/circl/sign/internal/dilithium/params/params.go new file mode 100644 index 0000000000..2df20e3a40 --- /dev/null +++ b/vendor/github.com/cloudflare/circl/sign/internal/dilithium/params/params.go @@ -0,0 +1,25 @@ +package params + +// We put these parameters in a separate package so that the Go code, +// such as ntt_amd64_src.go, that generates assembler can import it. + +const ( + SeedSize = 32 + N = 256 + Q = 8380417 // 2²³ - 2¹³ + 1 + QBits = 23 + Qinv = 4236238847 // = -(q^-1) mod 2³² + ROver256 = 41978 // = (256)⁻¹ R² mod q, where R=2³² + D = 13 + + // Size of T1 packed. (Note that the formula is not valid in general, + // but it is for the parameters used in the modes of Dilithium.) + PolyT1Size = (N * (QBits - D)) / 8 + + // Size of T0 packed. 
(Note that the formula is not valid in general, + // but it is for the parameters used in the modes of Dilithium.) + PolyT0Size = (N * D) / 8 + + // Size of a packed polynomial whose coefficients are in [0,16). + PolyLe16Size = N / 2 +) diff --git a/vendor/github.com/cloudflare/circl/sign/internal/dilithium/poly.go b/vendor/github.com/cloudflare/circl/sign/internal/dilithium/poly.go new file mode 100644 index 0000000000..96c0551b38 --- /dev/null +++ b/vendor/github.com/cloudflare/circl/sign/internal/dilithium/poly.go @@ -0,0 +1,101 @@ +package dilithium + +// An element of our base ring R which are polynomials over Z_q modulo +// the equation Xᴺ = -1, where q=2²³ - 2¹³ + 1 and N=256. +// +// Coefficients aren't always reduced. See Normalize(). +type Poly [N]uint32 + +// Reduces each of the coefficients to <2q. +func (p *Poly) reduceLe2QGeneric() { + for i := uint(0); i < N; i++ { + p[i] = ReduceLe2Q(p[i]) + } +} + +// Reduce each of the coefficients to > 31) + // Sets x to {0, 1, ..., (Q-1)/2, (Q-1)/2, ..., 1} + x = int32((Q-1)/2) - x + if uint32(x) >= bound { + return true + } + } + return false +} + +// Splits p into p1 and p0 such that [i]p1 * 2ᴰ + [i]p0 = [i]p +// with -2ᴰ⁻¹ < [i]p0 ≤ 2ᴰ⁻¹. Returns p0 + Q and p1. +// +// Requires the coefficients of p to be normalized. +func (p *Poly) Power2Round(p0PlusQ, p1 *Poly) { + for i := 0; i < N; i++ { + p0PlusQ[i], p1[i] = power2round(p[i]) + } +} + +// Sets p to the polynomial whose coefficients are the pointwise multiplication +// of those of a and b. The coefficients of p are bounded by 2q. +// +// Assumes a and b are in Montgomery form and that the pointwise product +// of each coefficient is below 2³² q. +func (p *Poly) mulHatGeneric(a, b *Poly) { + for i := 0; i < N; i++ { + p[i] = montReduceLe2Q(uint64(a[i]) * uint64(b[i])) + } +} + +// Sets p to 2ᵈ q without reducing. +// +// So it requires the coefficients of p to be less than 2³²⁻ᴰ. +func (p *Poly) mulBy2toDGeneric(q *Poly) { + for i := 0; i < N; i++ { + p[i] = q[i] << D + } +} diff --git a/vendor/github.com/cloudflare/circl/sign/internal/dilithium/stubs_amd64.go b/vendor/github.com/cloudflare/circl/sign/internal/dilithium/stubs_amd64.go new file mode 100644 index 0000000000..ca92f15ef1 --- /dev/null +++ b/vendor/github.com/cloudflare/circl/sign/internal/dilithium/stubs_amd64.go @@ -0,0 +1,35 @@ +// Code generated by command: go run src.go -out ../amd64.s -stubs ../stubs_amd64.go -pkg dilithium. DO NOT EDIT. + +//go:build amd64 && !purego + +package dilithium + +//go:noescape +func nttAVX2(p *[256]uint32) + +//go:noescape +func invNttAVX2(p *[256]uint32) + +//go:noescape +func mulHatAVX2(p *[256]uint32, a *[256]uint32, b *[256]uint32) + +//go:noescape +func addAVX2(p *[256]uint32, a *[256]uint32, b *[256]uint32) + +//go:noescape +func subAVX2(p *[256]uint32, a *[256]uint32, b *[256]uint32) + +//go:noescape +func packLe16AVX2(p *[256]uint32, buf *byte) + +//go:noescape +func reduceLe2QAVX2(p *[256]uint32) + +//go:noescape +func le2qModQAVX2(p *[256]uint32) + +//go:noescape +func exceedsAVX2(p *[256]uint32, bound uint32) uint8 + +//go:noescape +func mulBy2toDAVX2(p *[256]uint32, q *[256]uint32) diff --git a/vendor/github.com/cloudflare/circl/sign/mldsa/mldsa65/dilithium.go b/vendor/github.com/cloudflare/circl/sign/mldsa/mldsa65/dilithium.go new file mode 100644 index 0000000000..23a7b9a1f8 --- /dev/null +++ b/vendor/github.com/cloudflare/circl/sign/mldsa/mldsa65/dilithium.go @@ -0,0 +1,361 @@ +// Code generated from pkg.templ.go. DO NOT EDIT. 
+ +// mldsa65 implements NIST signature scheme ML-DSA-65 as defined in FIPS204. +package mldsa65 + +import ( + "crypto" + cryptoRand "crypto/rand" + "encoding/asn1" + "errors" + "io" + + "github.com/cloudflare/circl/sign" + common "github.com/cloudflare/circl/sign/internal/dilithium" + "github.com/cloudflare/circl/sign/mldsa/mldsa65/internal" +) + +const ( + // Size of seed for NewKeyFromSeed + SeedSize = common.SeedSize + + // Size of a packed PublicKey + PublicKeySize = internal.PublicKeySize + + // Size of a packed PrivateKey + PrivateKeySize = internal.PrivateKeySize + + // Size of a signature + SignatureSize = internal.SignatureSize +) + +// PublicKey is the type of ML-DSA-65 public key +type PublicKey internal.PublicKey + +// PrivateKey is the type of ML-DSA-65 private key +type PrivateKey internal.PrivateKey + +// GenerateKey generates a public/private key pair using entropy from rand. +// If rand is nil, crypto/rand.Reader will be used. +func GenerateKey(rand io.Reader) (*PublicKey, *PrivateKey, error) { + pk, sk, err := internal.GenerateKey(rand) + return (*PublicKey)(pk), (*PrivateKey)(sk), err +} + +// NewKeyFromSeed derives a public/private key pair using the given seed. +func NewKeyFromSeed(seed *[SeedSize]byte) (*PublicKey, *PrivateKey) { + pk, sk := internal.NewKeyFromSeed(seed) + return (*PublicKey)(pk), (*PrivateKey)(sk) +} + +// SignTo signs the given message and writes the signature into signature. +// It will panic if signature is not of length at least SignatureSize. +// +// ctx is the optional context string. Errors if ctx is larger than 255 bytes. +// A nil context string is equivalent to an empty context string. +func SignTo(sk *PrivateKey, msg, ctx []byte, randomized bool, sig []byte) error { + var rnd [32]byte + if randomized { + _, err := cryptoRand.Read(rnd[:]) + if err != nil { + return err + } + } + + if len(ctx) > 255 { + return sign.ErrContextTooLong + } + + internal.SignTo( + (*internal.PrivateKey)(sk), + func(w io.Writer) { + _, _ = w.Write([]byte{0}) + _, _ = w.Write([]byte{byte(len(ctx))}) + + if ctx != nil { + _, _ = w.Write(ctx) + } + w.Write(msg) + }, + rnd, + sig, + ) + return nil +} + +// Do not use. Implements ML-DSA.Sign_internal used for compatibility tests. +func (sk *PrivateKey) unsafeSignInternal(msg []byte, rnd [32]byte) []byte { + var ret [SignatureSize]byte + internal.SignTo( + (*internal.PrivateKey)(sk), + func(w io.Writer) { + _, _ = w.Write(msg) + }, + rnd, + ret[:], + ) + return ret[:] +} + +// Do not use. Implements ML-DSA.Verify_internal used for compatibility tests. +func unsafeVerifyInternal(pk *PublicKey, msg, sig []byte) bool { + return internal.Verify( + (*internal.PublicKey)(pk), + func(w io.Writer) { + _, _ = w.Write(msg) + }, + sig, + ) +} + +// Verify checks whether the given signature by pk on msg is valid. +// +// ctx is the optional context string. Fails if ctx is larger than 255 bytes. +// A nil context string is equivalent to an empty context string. +func Verify(pk *PublicKey, msg, ctx, sig []byte) bool { + if len(ctx) > 255 { + return false + } + return internal.Verify( + (*internal.PublicKey)(pk), + func(w io.Writer) { + _, _ = w.Write([]byte{0}) + _, _ = w.Write([]byte{byte(len(ctx))}) + + if ctx != nil { + _, _ = w.Write(ctx) + } + _, _ = w.Write(msg) + }, + sig, + ) +} + +// Sets pk to the public key encoded in buf. +func (pk *PublicKey) Unpack(buf *[PublicKeySize]byte) { + (*internal.PublicKey)(pk).Unpack(buf) +} + +// Sets sk to the private key encoded in buf. 
+func (sk *PrivateKey) Unpack(buf *[PrivateKeySize]byte) { + (*internal.PrivateKey)(sk).Unpack(buf) +} + +// Packs the public key into buf. +func (pk *PublicKey) Pack(buf *[PublicKeySize]byte) { + (*internal.PublicKey)(pk).Pack(buf) +} + +// Packs the private key into buf. +func (sk *PrivateKey) Pack(buf *[PrivateKeySize]byte) { + (*internal.PrivateKey)(sk).Pack(buf) +} + +// Packs the public key. +func (pk *PublicKey) Bytes() []byte { + var buf [PublicKeySize]byte + pk.Pack(&buf) + return buf[:] +} + +// Packs the private key. +func (sk *PrivateKey) Bytes() []byte { + var buf [PrivateKeySize]byte + sk.Pack(&buf) + return buf[:] +} + +// Packs the public key. +func (pk *PublicKey) MarshalBinary() ([]byte, error) { + return pk.Bytes(), nil +} + +// Packs the private key. +func (sk *PrivateKey) MarshalBinary() ([]byte, error) { + return sk.Bytes(), nil +} + +// Unpacks the public key from data. +func (pk *PublicKey) UnmarshalBinary(data []byte) error { + if len(data) != PublicKeySize { + return errors.New("packed public key must be of mldsa65.PublicKeySize bytes") + } + var buf [PublicKeySize]byte + copy(buf[:], data) + pk.Unpack(&buf) + return nil +} + +// Unpacks the private key from data. +func (sk *PrivateKey) UnmarshalBinary(data []byte) error { + if len(data) != PrivateKeySize { + return errors.New("packed private key must be of mldsa65.PrivateKeySize bytes") + } + var buf [PrivateKeySize]byte + copy(buf[:], data) + sk.Unpack(&buf) + return nil +} + +// Sign signs the given message. +// +// opts.HashFunc() must return zero, which can be achieved by passing +// crypto.Hash(0) for opts. rand is ignored. Will only return an error +// if opts.HashFunc() is non-zero. +// +// This function is used to make PrivateKey implement the crypto.Signer +// interface. The package-level SignTo function might be more convenient +// to use. +func (sk *PrivateKey) Sign(rand io.Reader, msg []byte, opts crypto.SignerOpts) ( + sig []byte, err error) { + var ret [SignatureSize]byte + + if opts.HashFunc() != crypto.Hash(0) { + return nil, errors.New("dilithium: cannot sign hashed message") + } + if err = SignTo(sk, msg, nil, false, ret[:]); err != nil { + return nil, err + } + + return ret[:], nil +} + +// Computes the public key corresponding to this private key. +// +// Returns a *PublicKey. The type crypto.PublicKey is used to make +// PrivateKey implement the crypto.Signer interface. +func (sk *PrivateKey) Public() crypto.PublicKey { + return (*PublicKey)((*internal.PrivateKey)(sk).Public()) +} + +// Equal returns whether the two private keys equal. +func (sk *PrivateKey) Equal(other crypto.PrivateKey) bool { + castOther, ok := other.(*PrivateKey) + if !ok { + return false + } + return (*internal.PrivateKey)(sk).Equal((*internal.PrivateKey)(castOther)) +} + +// Equal returns whether the two public keys equal. +func (pk *PublicKey) Equal(other crypto.PublicKey) bool { + castOther, ok := other.(*PublicKey) + if !ok { + return false + } + return (*internal.PublicKey)(pk).Equal((*internal.PublicKey)(castOther)) +} + +// Boilerplate for generic signatures API + +type scheme struct{} + +var sch sign.Scheme = &scheme{} + +// Scheme returns a generic signature interface for ML-DSA-65. 
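+//
+// A rough usage sketch via the generic API (msg is a placeholder for
+// the caller's message bytes):
+//
+//	s := mldsa65.Scheme()
+//	pk, sk, err := s.GenerateKey()
+//	sig := s.Sign(sk, msg, nil)       // panics on key-type mismatch
+//	ok := s.Verify(pk, msg, sig, nil)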
+func Scheme() sign.Scheme { return sch } + +func (*scheme) Name() string { return "ML-DSA-65" } +func (*scheme) PublicKeySize() int { return PublicKeySize } +func (*scheme) PrivateKeySize() int { return PrivateKeySize } +func (*scheme) SignatureSize() int { return SignatureSize } +func (*scheme) SeedSize() int { return SeedSize } + +// TODO TLSIdentifier() +func (*scheme) Oid() asn1.ObjectIdentifier { + return asn1.ObjectIdentifier{2, 16, 840, 1, 101, 3, 4, 18} +} + +func (*scheme) SupportsContext() bool { + return true +} + +func (*scheme) GenerateKey() (sign.PublicKey, sign.PrivateKey, error) { + return GenerateKey(nil) +} + +func (*scheme) Sign( + sk sign.PrivateKey, + msg []byte, + opts *sign.SignatureOpts, +) []byte { + var ctx []byte + sig := make([]byte, SignatureSize) + + priv, ok := sk.(*PrivateKey) + if !ok { + panic(sign.ErrTypeMismatch) + } + if opts != nil && opts.Context != "" { + ctx = []byte(opts.Context) + } + err := SignTo(priv, msg, ctx, false, sig) + if err != nil { + panic(err) + } + + return sig +} + +func (*scheme) Verify( + pk sign.PublicKey, + msg, sig []byte, + opts *sign.SignatureOpts, +) bool { + var ctx []byte + pub, ok := pk.(*PublicKey) + if !ok { + panic(sign.ErrTypeMismatch) + } + if opts != nil && opts.Context != "" { + ctx = []byte(opts.Context) + } + return Verify(pub, msg, ctx, sig) +} + +func (*scheme) DeriveKey(seed []byte) (sign.PublicKey, sign.PrivateKey) { + if len(seed) != SeedSize { + panic(sign.ErrSeedSize) + } + var seed2 [SeedSize]byte + copy(seed2[:], seed) + return NewKeyFromSeed(&seed2) +} + +func (*scheme) UnmarshalBinaryPublicKey(buf []byte) (sign.PublicKey, error) { + if len(buf) != PublicKeySize { + return nil, sign.ErrPubKeySize + } + + var ( + buf2 [PublicKeySize]byte + ret PublicKey + ) + + copy(buf2[:], buf) + ret.Unpack(&buf2) + return &ret, nil +} + +func (*scheme) UnmarshalBinaryPrivateKey(buf []byte) (sign.PrivateKey, error) { + if len(buf) != PrivateKeySize { + return nil, sign.ErrPrivKeySize + } + + var ( + buf2 [PrivateKeySize]byte + ret PrivateKey + ) + + copy(buf2[:], buf) + ret.Unpack(&buf2) + return &ret, nil +} + +func (sk *PrivateKey) Scheme() sign.Scheme { + return sch +} + +func (sk *PublicKey) Scheme() sign.Scheme { + return sch +} diff --git a/vendor/github.com/cloudflare/circl/sign/mldsa/mldsa65/internal/dilithium.go b/vendor/github.com/cloudflare/circl/sign/mldsa/mldsa65/internal/dilithium.go new file mode 100644 index 0000000000..8f1c8e5cbf --- /dev/null +++ b/vendor/github.com/cloudflare/circl/sign/mldsa/mldsa65/internal/dilithium.go @@ -0,0 +1,491 @@ +// Code generated from mode3/internal/dilithium.go by gen.go + +package internal + +import ( + cryptoRand "crypto/rand" + "crypto/subtle" + "io" + + "github.com/cloudflare/circl/internal/sha3" + common "github.com/cloudflare/circl/sign/internal/dilithium" +) + +const ( + // Size of a packed polynomial of norm ≤η. + // (Note that the formula is not valid in general.) + PolyLeqEtaSize = (common.N * DoubleEtaBits) / 8 + + // β = τη, the maximum size of c s₂. 
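+	// For ML-DSA-65: τ = 49 and η = 4, so β = 196.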
+ Beta = Tau * Eta + + // γ₁ range of y + Gamma1 = 1 << Gamma1Bits + + // Size of packed polynomial of norm <γ₁ such as z + PolyLeGamma1Size = (Gamma1Bits + 1) * common.N / 8 + + // α = 2γ₂ parameter for decompose + Alpha = 2 * Gamma2 + + // Size of a packed private key + PrivateKeySize = 32 + 32 + TRSize + PolyLeqEtaSize*(L+K) + common.PolyT0Size*K + + // Size of a packed public key + PublicKeySize = 32 + common.PolyT1Size*K + + // Size of a packed signature + SignatureSize = L*PolyLeGamma1Size + Omega + K + CTildeSize + + // Size of packed w₁ + PolyW1Size = (common.N * (common.QBits - Gamma1Bits)) / 8 +) + +// PublicKey is the type of Dilithium public keys. +type PublicKey struct { + rho [32]byte + t1 VecK + + // Cached values + t1p [common.PolyT1Size * K]byte + A *Mat + tr *[TRSize]byte +} + +// PrivateKey is the type of Dilithium private keys. +type PrivateKey struct { + rho [32]byte + key [32]byte + s1 VecL + s2 VecK + t0 VecK + tr [TRSize]byte + + // Cached values + A Mat // ExpandA(ρ) + s1h VecL // NTT(s₁) + s2h VecK // NTT(s₂) + t0h VecK // NTT(t₀) +} + +type unpackedSignature struct { + z VecL + hint VecK + c [CTildeSize]byte +} + +// Packs the signature into buf. +func (sig *unpackedSignature) Pack(buf []byte) { + copy(buf[:], sig.c[:]) + sig.z.PackLeGamma1(buf[CTildeSize:]) + sig.hint.PackHint(buf[CTildeSize+L*PolyLeGamma1Size:]) +} + +// Sets sig to the signature encoded in the buffer. +// +// Returns whether buf contains a properly packed signature. +func (sig *unpackedSignature) Unpack(buf []byte) bool { + if len(buf) < SignatureSize { + return false + } + copy(sig.c[:], buf[:]) + sig.z.UnpackLeGamma1(buf[CTildeSize:]) + if sig.z.Exceeds(Gamma1 - Beta) { + return false + } + if !sig.hint.UnpackHint(buf[CTildeSize+L*PolyLeGamma1Size:]) { + return false + } + return true +} + +// Packs the public key into buf. +func (pk *PublicKey) Pack(buf *[PublicKeySize]byte) { + copy(buf[:32], pk.rho[:]) + copy(buf[32:], pk.t1p[:]) +} + +// Sets pk to the public key encoded in buf. +func (pk *PublicKey) Unpack(buf *[PublicKeySize]byte) { + copy(pk.rho[:], buf[:32]) + copy(pk.t1p[:], buf[32:]) + + pk.t1.UnpackT1(pk.t1p[:]) + pk.A = new(Mat) + pk.A.Derive(&pk.rho) + + // tr = CRH(ρ ‖ t1) = CRH(pk) + pk.tr = new([TRSize]byte) + h := sha3.NewShake256() + _, _ = h.Write(buf[:]) + _, _ = h.Read(pk.tr[:]) +} + +// Packs the private key into buf. +func (sk *PrivateKey) Pack(buf *[PrivateKeySize]byte) { + copy(buf[:32], sk.rho[:]) + copy(buf[32:64], sk.key[:]) + copy(buf[64:64+TRSize], sk.tr[:]) + offset := 64 + TRSize + sk.s1.PackLeqEta(buf[offset:]) + offset += PolyLeqEtaSize * L + sk.s2.PackLeqEta(buf[offset:]) + offset += PolyLeqEtaSize * K + sk.t0.PackT0(buf[offset:]) +} + +// Sets sk to the private key encoded in buf. +func (sk *PrivateKey) Unpack(buf *[PrivateKeySize]byte) { + copy(sk.rho[:], buf[:32]) + copy(sk.key[:], buf[32:64]) + copy(sk.tr[:], buf[64:64+TRSize]) + offset := 64 + TRSize + sk.s1.UnpackLeqEta(buf[offset:]) + offset += PolyLeqEtaSize * L + sk.s2.UnpackLeqEta(buf[offset:]) + offset += PolyLeqEtaSize * K + sk.t0.UnpackT0(buf[offset:]) + + // Cached values + sk.A.Derive(&sk.rho) + sk.t0h = sk.t0 + sk.t0h.NTT() + sk.s1h = sk.s1 + sk.s1h.NTT() + sk.s2h = sk.s2 + sk.s2h.NTT() +} + +// GenerateKey generates a public/private key pair using entropy from rand. +// If rand is nil, crypto/rand.Reader will be used. 
+func GenerateKey(rand io.Reader) (*PublicKey, *PrivateKey, error) { + var seed [32]byte + if rand == nil { + rand = cryptoRand.Reader + } + _, err := io.ReadFull(rand, seed[:]) + if err != nil { + return nil, nil, err + } + pk, sk := NewKeyFromSeed(&seed) + return pk, sk, nil +} + +// NewKeyFromSeed derives a public/private key pair using the given seed. +func NewKeyFromSeed(seed *[common.SeedSize]byte) (*PublicKey, *PrivateKey) { + var eSeed [128]byte // expanded seed + var pk PublicKey + var sk PrivateKey + var sSeed [64]byte + + h := sha3.NewShake256() + _, _ = h.Write(seed[:]) + + if NIST { + _, _ = h.Write([]byte{byte(K), byte(L)}) + } + + _, _ = h.Read(eSeed[:]) + + copy(pk.rho[:], eSeed[:32]) + copy(sSeed[:], eSeed[32:96]) + copy(sk.key[:], eSeed[96:]) + copy(sk.rho[:], pk.rho[:]) + + sk.A.Derive(&pk.rho) + + for i := uint16(0); i < L; i++ { + PolyDeriveUniformLeqEta(&sk.s1[i], &sSeed, i) + } + + for i := uint16(0); i < K; i++ { + PolyDeriveUniformLeqEta(&sk.s2[i], &sSeed, i+L) + } + + sk.s1h = sk.s1 + sk.s1h.NTT() + sk.s2h = sk.s2 + sk.s2h.NTT() + + sk.computeT0andT1(&sk.t0, &pk.t1) + + sk.t0h = sk.t0 + sk.t0h.NTT() + + // Complete public key far enough to be packed + pk.t1.PackT1(pk.t1p[:]) + pk.A = &sk.A + + // Finish private key + var packedPk [PublicKeySize]byte + pk.Pack(&packedPk) + + // tr = CRH(ρ ‖ t1) = CRH(pk) + h.Reset() + _, _ = h.Write(packedPk[:]) + _, _ = h.Read(sk.tr[:]) + + // Finish cache of public key + pk.tr = &sk.tr + + return &pk, &sk +} + +// Computes t0 and t1 from sk.s1h, sk.s2 and sk.A. +func (sk *PrivateKey) computeT0andT1(t0, t1 *VecK) { + var t VecK + + // Set t to A s₁ + s₂ + for i := 0; i < K; i++ { + PolyDotHat(&t[i], &sk.A[i], &sk.s1h) + t[i].ReduceLe2Q() + t[i].InvNTT() + } + t.Add(&t, &sk.s2) + t.Normalize() + + // Compute t₀, t₁ = Power2Round(t) + t.Power2Round(t0, t1) +} + +// Verify checks whether the given signature by pk on msg is valid. +// +// For Dilithium this is the top-level verification function. +// In ML-DSA, this is ML-DSA.Verify_internal. +func Verify(pk *PublicKey, msg func(io.Writer), signature []byte) bool { + var sig unpackedSignature + var mu [64]byte + var zh VecL + var Az, Az2dct1, w1 VecK + var ch common.Poly + var cp [CTildeSize]byte + var w1Packed [PolyW1Size * K]byte + + // Note that Unpack() checked whether ‖z‖_∞ < γ₁ - β + // and ensured that there at most ω ones in pk.hint. + if !sig.Unpack(signature) { + return false + } + + // μ = CRH(tr ‖ msg) + h := sha3.NewShake256() + _, _ = h.Write(pk.tr[:]) + msg(&h) + _, _ = h.Read(mu[:]) + + // Compute Az + zh = sig.z + zh.NTT() + + for i := 0; i < K; i++ { + PolyDotHat(&Az[i], &pk.A[i], &zh) + } + + // Next, we compute Az - 2ᵈ·c·t₁. + // Note that the coefficients of t₁ are bounded by 256 = 2⁹, + // so the coefficients of Az2dct1 will bounded by 2⁹⁺ᵈ = 2²³ < 2q, + // which is small enough for NTT(). + Az2dct1.MulBy2toD(&pk.t1) + Az2dct1.NTT() + PolyDeriveUniformBall(&ch, sig.c[:]) + ch.NTT() + for i := 0; i < K; i++ { + Az2dct1[i].MulHat(&Az2dct1[i], &ch) + } + Az2dct1.Sub(&Az, &Az2dct1) + Az2dct1.ReduceLe2Q() + Az2dct1.InvNTT() + Az2dct1.NormalizeAssumingLe2Q() + + // UseHint(pk.hint, Az - 2ᵈ·c·t₁) + // = UseHint(pk.hint, w - c·s₂ + c·t₀) + // = UseHint(pk.hint, r + c·t₀) + // = r₁ = w₁. + w1.UseHint(&Az2dct1, &sig.hint) + w1.PackW1(w1Packed[:]) + + // c' = H(μ, w₁) + h.Reset() + _, _ = h.Write(mu[:]) + _, _ = h.Write(w1Packed[:]) + _, _ = h.Read(cp[:]) + + return sig.c == cp +} + +// SignTo signs the given message and writes the signature into signature. 
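+// The msg callback streams the message into the signing hash, so the
+// caller can prepend domain separation (as SignTo in the parent
+// package does).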
+// +// For Dilithium this is the top-level signing function. For ML-DSA +// this is ML-DSA.Sign_internal. +// +//nolint:funlen +func SignTo(sk *PrivateKey, msg func(io.Writer), rnd [32]byte, signature []byte) { + var mu, rhop [64]byte + var w1Packed [PolyW1Size * K]byte + var y, yh VecL + var w, w0, w1, w0mcs2, ct0, w0mcs2pct0 VecK + var ch common.Poly + var yNonce uint16 + var sig unpackedSignature + + if len(signature) < SignatureSize { + panic("Signature does not fit in that byteslice") + } + + // μ = CRH(tr ‖ msg) + h := sha3.NewShake256() + _, _ = h.Write(sk.tr[:]) + msg(&h) + _, _ = h.Read(mu[:]) + + // ρ' = CRH(key ‖ μ) + h.Reset() + _, _ = h.Write(sk.key[:]) + if NIST { + _, _ = h.Write(rnd[:]) + } + _, _ = h.Write(mu[:]) + _, _ = h.Read(rhop[:]) + + // Main rejection loop + attempt := 0 + for { + attempt++ + if attempt >= 576 { + // Depending on the mode, one try has a chance between 1/7 and 1/4 + // of succeeding. Thus it is safe to say that 576 iterations + // are enough as (6/7)⁵⁷⁶ < 2⁻¹²⁸. + panic("This should only happen 1 in 2^{128}: something is wrong.") + } + + // y = ExpandMask(ρ', key) + VecLDeriveUniformLeGamma1(&y, &rhop, yNonce) + yNonce += uint16(L) + + // Set w to A y + yh = y + yh.NTT() + for i := 0; i < K; i++ { + PolyDotHat(&w[i], &sk.A[i], &yh) + w[i].ReduceLe2Q() + w[i].InvNTT() + } + + // Decompose w into w₀ and w₁ + w.NormalizeAssumingLe2Q() + w.Decompose(&w0, &w1) + + // c~ = H(μ ‖ w₁) + w1.PackW1(w1Packed[:]) + h.Reset() + _, _ = h.Write(mu[:]) + _, _ = h.Write(w1Packed[:]) + _, _ = h.Read(sig.c[:]) + + PolyDeriveUniformBall(&ch, sig.c[:]) + ch.NTT() + + // Ensure ‖ w₀ - c·s2 ‖_∞ < γ₂ - β. + // + // By Lemma 3 of the specification this is equivalent to checking that + // both ‖ r₀ ‖_∞ < γ₂ - β and r₁ = w₁, for the decomposition + // w - c·s₂ = r₁ α + r₀ as computed by decompose(). + // See also §4.1 of the specification. + for i := 0; i < K; i++ { + w0mcs2[i].MulHat(&ch, &sk.s2h[i]) + w0mcs2[i].InvNTT() + } + w0mcs2.Sub(&w0, &w0mcs2) + w0mcs2.Normalize() + + if w0mcs2.Exceeds(Gamma2 - Beta) { + continue + } + + // z = y + c·s₁ + for i := 0; i < L; i++ { + sig.z[i].MulHat(&ch, &sk.s1h[i]) + sig.z[i].InvNTT() + } + sig.z.Add(&sig.z, &y) + sig.z.Normalize() + + // Ensure ‖z‖_∞ < γ₁ - β + if sig.z.Exceeds(Gamma1 - Beta) { + continue + } + + // Compute c·t₀ + for i := 0; i < K; i++ { + ct0[i].MulHat(&ch, &sk.t0h[i]) + ct0[i].InvNTT() + } + ct0.NormalizeAssumingLe2Q() + + // Ensure ‖c·t₀‖_∞ < γ₂. + if ct0.Exceeds(Gamma2) { + continue + } + + // Create the hint to be able to reconstruct w₁ from w - c·s₂ + c·t0. + // Note that we're not using makeHint() in the obvious way as we + // do not know whether ‖ sc·s₂ - c·t₀ ‖_∞ < γ₂. Instead we note + // that our makeHint() is actually the same as a makeHint for a + // different decomposition: + // + // Earlier we ensured indirectly with a check that r₁ = w₁ where + // r = w - c·s₂. Hence r₀ = r - r₁ α = w - c·s₂ - w₁ α = w₀ - c·s₂. + // Thus MakeHint(w₀ - c·s₂ + c·t₀, w₁) = MakeHint(r0 + c·t₀, r₁) + // and UseHint(w - c·s₂ + c·t₀, w₁) = UseHint(r + c·t₀, r₁). + // As we just ensured that ‖ c·t₀ ‖_∞ < γ₂ our usage is correct. + w0mcs2pct0.Add(&w0mcs2, &ct0) + w0mcs2pct0.NormalizeAssumingLe2Q() + hintPop := sig.hint.MakeHint(&w0mcs2pct0, &w1) + if hintPop > Omega { + continue + } + + break + } + + sig.Pack(signature[:]) +} + +// Computes the public key corresponding to this private key. 
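+// The returned key shares ρ, A and tr with sk; t1 is recomputed via
+// computeT0andT1.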
+func (sk *PrivateKey) Public() *PublicKey { + var t0 VecK + pk := &PublicKey{ + rho: sk.rho, + A: &sk.A, + tr: &sk.tr, + } + sk.computeT0andT1(&t0, &pk.t1) + pk.t1.PackT1(pk.t1p[:]) + return pk +} + +// Equal returns whether the two public keys are equal +func (pk *PublicKey) Equal(other *PublicKey) bool { + return pk.rho == other.rho && pk.t1 == other.t1 +} + +// Equal returns whether the two private keys are equal +func (sk *PrivateKey) Equal(other *PrivateKey) bool { + ret := (subtle.ConstantTimeCompare(sk.rho[:], other.rho[:]) & + subtle.ConstantTimeCompare(sk.key[:], other.key[:]) & + subtle.ConstantTimeCompare(sk.tr[:], other.tr[:])) + + acc := uint32(0) + for i := 0; i < L; i++ { + for j := 0; j < common.N; j++ { + acc |= sk.s1[i][j] ^ other.s1[i][j] + } + } + for i := 0; i < K; i++ { + for j := 0; j < common.N; j++ { + acc |= sk.s2[i][j] ^ other.s2[i][j] + acc |= sk.t0[i][j] ^ other.t0[i][j] + } + } + return (ret & subtle.ConstantTimeEq(int32(acc), 0)) == 1 +} diff --git a/vendor/github.com/cloudflare/circl/sign/mldsa/mldsa65/internal/mat.go b/vendor/github.com/cloudflare/circl/sign/mldsa/mldsa65/internal/mat.go new file mode 100644 index 0000000000..ceaf634fa7 --- /dev/null +++ b/vendor/github.com/cloudflare/circl/sign/mldsa/mldsa65/internal/mat.go @@ -0,0 +1,59 @@ +// Code generated from mode3/internal/mat.go by gen.go + +package internal + +import ( + common "github.com/cloudflare/circl/sign/internal/dilithium" +) + +// A k by l matrix of polynomials. +type Mat [K]VecL + +// Expands the given seed to a complete matrix. +// +// This function is called ExpandA in the specification. +func (m *Mat) Derive(seed *[32]byte) { + if !DeriveX4Available { + for i := uint16(0); i < K; i++ { + for j := uint16(0); j < L; j++ { + PolyDeriveUniform(&m[i][j], seed, (i<<8)+j) + } + } + return + } + + idx := 0 + var nonces [4]uint16 + var ps [4]*common.Poly + for i := uint16(0); i < K; i++ { + for j := uint16(0); j < L; j++ { + nonces[idx] = (i << 8) + j + ps[idx] = &m[i][j] + idx++ + if idx == 4 { + idx = 0 + PolyDeriveUniformX4(ps, seed, nonces) + } + } + } + if idx != 0 { + for i := idx; i < 4; i++ { + ps[i] = nil + } + PolyDeriveUniformX4(ps, seed, nonces) + } +} + +// Set p to the inner product of a and b using pointwise multiplication. +// +// Assumes a and b are in Montgomery form and their coefficients are +// pairwise sufficiently small to multiply, see Poly.MulHat(). Resulting +// coefficients are bounded by 2Lq. +func PolyDotHat(p *common.Poly, a, b *VecL) { + var t common.Poly + *p = common.Poly{} // zero p + for i := 0; i < L; i++ { + t.MulHat(&a[i], &b[i]) + p.Add(&t, p) + } +} diff --git a/vendor/github.com/cloudflare/circl/sign/mldsa/mldsa65/internal/pack.go b/vendor/github.com/cloudflare/circl/sign/mldsa/mldsa65/internal/pack.go new file mode 100644 index 0000000000..1854b41973 --- /dev/null +++ b/vendor/github.com/cloudflare/circl/sign/mldsa/mldsa65/internal/pack.go @@ -0,0 +1,270 @@ +// Code generated from mode3/internal/pack.go by gen.go + +package internal + +import ( + common "github.com/cloudflare/circl/sign/internal/dilithium" +) + +// Writes p with norm less than or equal η into buf, which must be of +// size PolyLeqEtaSize. +// +// Assumes coefficients of p are not normalized, but in [q-η,q+η]. 
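+// With η = 4 (DoubleEtaBits == 4) each value Q+η−p[j] lies in [0, 2η],
+// so it fits in four bits and two coefficients pack into each byte.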
+func PolyPackLeqEta(p *common.Poly, buf []byte) {
+	if DoubleEtaBits == 4 { // compiler eliminates branch
+		j := 0
+		for i := 0; i < PolyLeqEtaSize; i++ {
+			buf[i] = (byte(common.Q+Eta-p[j]) |
+				byte(common.Q+Eta-p[j+1])<<4)
+			j += 2
+		}
+	} else if DoubleEtaBits == 3 {
+		j := 0
+		for i := 0; i < PolyLeqEtaSize; i += 3 {
+			buf[i] = (byte(common.Q+Eta-p[j]) |
+				(byte(common.Q+Eta-p[j+1]) << 3) |
+				(byte(common.Q+Eta-p[j+2]) << 6))
+			buf[i+1] = ((byte(common.Q+Eta-p[j+2]) >> 2) |
+				(byte(common.Q+Eta-p[j+3]) << 1) |
+				(byte(common.Q+Eta-p[j+4]) << 4) |
+				(byte(common.Q+Eta-p[j+5]) << 7))
+			buf[i+2] = ((byte(common.Q+Eta-p[j+5]) >> 1) |
+				(byte(common.Q+Eta-p[j+6]) << 2) |
+				(byte(common.Q+Eta-p[j+7]) << 5))
+			j += 8
+		}
+	} else {
+		panic("eta not supported")
+	}
+}
+
+// Sets p to the polynomial of norm less than or equal η encoded in the
+// given buffer of size PolyLeqEtaSize.
+//
+// Output coefficients of p are not normalized, but in [q-η,q+η] provided
+// buf was created using PackLeqEta.
+//
+// Beware, for arbitrary buf the coefficients of p might end up in
+// the interval [q-2^b,q+2^b] where b is the least b with η≤2^b.
+func PolyUnpackLeqEta(p *common.Poly, buf []byte) {
+	if DoubleEtaBits == 4 { // compiler eliminates branch
+		j := 0
+		for i := 0; i < PolyLeqEtaSize; i++ {
+			p[j] = common.Q + Eta - uint32(buf[i]&15)
+			p[j+1] = common.Q + Eta - uint32(buf[i]>>4)
+			j += 2
+		}
+	} else if DoubleEtaBits == 3 {
+		j := 0
+		for i := 0; i < PolyLeqEtaSize; i += 3 {
+			p[j] = common.Q + Eta - uint32(buf[i]&7)
+			p[j+1] = common.Q + Eta - uint32((buf[i]>>3)&7)
+			p[j+2] = common.Q + Eta - uint32((buf[i]>>6)|((buf[i+1]<<2)&7))
+			p[j+3] = common.Q + Eta - uint32((buf[i+1]>>1)&7)
+			p[j+4] = common.Q + Eta - uint32((buf[i+1]>>4)&7)
+			p[j+5] = common.Q + Eta - uint32((buf[i+1]>>7)|((buf[i+2]<<1)&7))
+			p[j+6] = common.Q + Eta - uint32((buf[i+2]>>2)&7)
+			p[j+7] = common.Q + Eta - uint32((buf[i+2]>>5)&7)
+			j += 8
+		}
+	} else {
+		panic("eta not supported")
+	}
+}
+
+// Writes v with coefficients in {0, 1} of which at most ω non-zero
+// to buf, which must have length ω+k.
+func (v *VecK) PackHint(buf []byte) {
+	// The packed hint starts with the indices of the non-zero coefficients
+	// For instance:
+	//
+	//	(x⁵⁶ + x¹⁰⁰, x²⁵⁵, 0, x² + x²³, x¹)
+	//
+	// Yields
+	//
+	//	56, 100, 255, 2, 23, 1
+	//
+	// Then we pad with zeroes until we have a list of ω items:
+	//
+	//	56, 100, 255, 2, 23, 1, 0, 0, ..., 0
+	//
+	// Then we finish with a list of the switch-over-indices in this
+	// list between polynomials, so:
+	//
+	//	56, 100, 255, 2, 23, 1, 0, 0, ..., 0, 2, 3, 3, 5, 6
+
+	off := uint8(0)
+	for i := 0; i < K; i++ {
+		for j := uint16(0); j < common.N; j++ {
+			if v[i][j] != 0 {
+				buf[off] = uint8(j)
+				off++
+			}
+		}
+		buf[Omega+i] = off
+	}
+	for ; off < Omega; off++ {
+		buf[off] = 0
+	}
+}
+
+// Sets v to the vector encoded using VecK.PackHint()
+//
+// Returns whether unpacking was successful.
+func (v *VecK) UnpackHint(buf []byte) bool {
+	// A priori, there would be several reasonable ways to encode the same
+	// hint vector. We take care to allow only one encoding, to ensure
+	// "strong unforgeability".
+	//
+	// See PackHint() source for description of the encoding.
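+	// Concretely this means: the switch-over points must be non-decreasing
+	// and at most ω, the indices within each polynomial strictly increasing,
+	// and every padding byte zero; any other byte stream is rejected.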
+ *v = VecK{} // zero v + prevSOP := uint8(0) // previous switch-over-point + for i := 0; i < K; i++ { + SOP := buf[Omega+i] + if SOP < prevSOP || SOP > Omega { + return false // ensures switch-over-points are increasing + } + for j := prevSOP; j < SOP; j++ { + if j > prevSOP && buf[j] <= buf[j-1] { + return false // ensures indices are increasing (within a poly) + } + v[i][buf[j]] = 1 + } + prevSOP = SOP + } + for j := prevSOP; j < Omega; j++ { + if buf[j] != 0 { + return false // ensures padding indices are zero + } + } + + return true +} + +// Sets p to the polynomial packed into buf by PolyPackLeGamma1. +// +// p will be normalized. +func PolyUnpackLeGamma1(p *common.Poly, buf []byte) { + if Gamma1Bits == 17 { + j := 0 + for i := 0; i < PolyLeGamma1Size; i += 9 { + p0 := uint32(buf[i]) | (uint32(buf[i+1]) << 8) | + (uint32(buf[i+2]&0x3) << 16) + p1 := uint32(buf[i+2]>>2) | (uint32(buf[i+3]) << 6) | + (uint32(buf[i+4]&0xf) << 14) + p2 := uint32(buf[i+4]>>4) | (uint32(buf[i+5]) << 4) | + (uint32(buf[i+6]&0x3f) << 12) + p3 := uint32(buf[i+6]>>6) | (uint32(buf[i+7]) << 2) | + (uint32(buf[i+8]) << 10) + + // coefficients in [0,…,2γ₁) + p0 = Gamma1 - p0 // (-γ₁,…,γ₁] + p1 = Gamma1 - p1 + p2 = Gamma1 - p2 + p3 = Gamma1 - p3 + + p0 += uint32(int32(p0)>>31) & common.Q // normalize + p1 += uint32(int32(p1)>>31) & common.Q + p2 += uint32(int32(p2)>>31) & common.Q + p3 += uint32(int32(p3)>>31) & common.Q + + p[j] = p0 + p[j+1] = p1 + p[j+2] = p2 + p[j+3] = p3 + + j += 4 + } + } else if Gamma1Bits == 19 { + j := 0 + for i := 0; i < PolyLeGamma1Size; i += 5 { + p0 := uint32(buf[i]) | (uint32(buf[i+1]) << 8) | + (uint32(buf[i+2]&0xf) << 16) + p1 := uint32(buf[i+2]>>4) | (uint32(buf[i+3]) << 4) | + (uint32(buf[i+4]) << 12) + + p0 = Gamma1 - p0 + p1 = Gamma1 - p1 + + p0 += uint32(int32(p0)>>31) & common.Q + p1 += uint32(int32(p1)>>31) & common.Q + + p[j] = p0 + p[j+1] = p1 + + j += 2 + } + } else { + panic("γ₁ not supported") + } +} + +// Writes p whose coefficients are in (-γ₁,γ₁] into buf +// which has to be of length PolyLeGamma1Size. +// +// Assumes p is normalized. +func PolyPackLeGamma1(p *common.Poly, buf []byte) { + if Gamma1Bits == 17 { + j := 0 + // coefficients in [0,…,γ₁] ∪ (q-γ₁,…,q) + for i := 0; i < PolyLeGamma1Size; i += 9 { + p0 := Gamma1 - p[j] // [0,…,γ₁] ∪ (γ₁-q,…,2γ₁-q) + p0 += uint32(int32(p0)>>31) & common.Q // [0,…,2γ₁) + p1 := Gamma1 - p[j+1] + p1 += uint32(int32(p1)>>31) & common.Q + p2 := Gamma1 - p[j+2] + p2 += uint32(int32(p2)>>31) & common.Q + p3 := Gamma1 - p[j+3] + p3 += uint32(int32(p3)>>31) & common.Q + + buf[i+0] = byte(p0) + buf[i+1] = byte(p0 >> 8) + buf[i+2] = byte(p0>>16) | byte(p1<<2) + buf[i+3] = byte(p1 >> 6) + buf[i+4] = byte(p1>>14) | byte(p2<<4) + buf[i+5] = byte(p2 >> 4) + buf[i+6] = byte(p2>>12) | byte(p3<<6) + buf[i+7] = byte(p3 >> 2) + buf[i+8] = byte(p3 >> 10) + + j += 4 + } + } else if Gamma1Bits == 19 { + j := 0 + for i := 0; i < PolyLeGamma1Size; i += 5 { + // Coefficients are in [0, γ₁] ∪ (Q-γ₁, Q) + p0 := Gamma1 - p[j] + p0 += uint32(int32(p0)>>31) & common.Q + p1 := Gamma1 - p[j+1] + p1 += uint32(int32(p1)>>31) & common.Q + + buf[i+0] = byte(p0) + buf[i+1] = byte(p0 >> 8) + buf[i+2] = byte(p0>>16) | byte(p1<<4) + buf[i+3] = byte(p1 >> 4) + buf[i+4] = byte(p1 >> 12) + + j += 2 + } + } else { + panic("γ₁ not supported") + } +} + +// Pack w₁ into buf, which must be of length PolyW1Size. +// +// Assumes w₁ is normalized. 
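+//
+// With γ₂ = (q-1)/32 (Gamma1Bits == 19) the coefficients of w₁ lie in
+// [0,15] and take 4 bits each; with γ₂ = (q-1)/88 (Gamma1Bits == 17) they
+// lie in [0,43] and take 6 bits each.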
+func PolyPackW1(p *common.Poly, buf []byte) {
+	if Gamma1Bits == 19 {
+		p.PackLe16(buf)
+	} else if Gamma1Bits == 17 {
+		j := 0
+		for i := 0; i < PolyW1Size; i += 3 {
+			buf[i] = byte(p[j]) | byte(p[j+1]<<6)
+			buf[i+1] = byte(p[j+1]>>2) | byte(p[j+2]<<4)
+			buf[i+2] = byte(p[j+2]>>4) | byte(p[j+3]<<2)
+			j += 4
+		}
+	} else {
+		panic("unsupported γ₁")
+	}
+}
diff --git a/vendor/github.com/cloudflare/circl/sign/mldsa/mldsa65/internal/params.go b/vendor/github.com/cloudflare/circl/sign/mldsa/mldsa65/internal/params.go
new file mode 100644
index 0000000000..8a1f866e65
--- /dev/null
+++ b/vendor/github.com/cloudflare/circl/sign/mldsa/mldsa65/internal/params.go
@@ -0,0 +1,18 @@
+// Code generated from params.templ.go. DO NOT EDIT.
+
+package internal
+
+const (
+	Name          = "ML-DSA-65"
+	K             = 6
+	L             = 5
+	Eta           = 4
+	DoubleEtaBits = 4
+	Omega         = 55
+	Tau           = 49
+	Gamma1Bits    = 19
+	Gamma2        = 261888
+	NIST          = true
+	TRSize        = 64
+	CTildeSize    = 48
+)
diff --git a/vendor/github.com/cloudflare/circl/sign/mldsa/mldsa65/internal/rounding.go b/vendor/github.com/cloudflare/circl/sign/mldsa/mldsa65/internal/rounding.go
new file mode 100644
index 0000000000..58123c090b
--- /dev/null
+++ b/vendor/github.com/cloudflare/circl/sign/mldsa/mldsa65/internal/rounding.go
@@ -0,0 +1,142 @@
+// Code generated from mode3/internal/rounding.go by gen.go
+
+package internal
+
+import (
+	common "github.com/cloudflare/circl/sign/internal/dilithium"
+)
+
+// Splits 0 ≤ a < q into a₀ and a₁ with a = a₁*α + a₀ with -α/2 < a₀ ≤ α/2,
+// except for when we would have a₁ = (q-1)/α in which case a₁=0 is taken
+// and -α/2 ≤ a₀ < 0. Returns a₀ + q. Note 0 ≤ a₁ < (q-1)/α.
+// Recall α = 2γ₂.
+func decompose(a uint32) (a0plusQ, a1 uint32) {
+	// a₁ = ⌈a / 128⌉
+	a1 = (a + 127) >> 7
+
+	if Alpha == 523776 {
+		// 1025/2²² is close enough to 1/4092 so that a₁
+		// becomes a/α rounded down.
+		a1 = ((a1*1025 + (1 << 21)) >> 22)
+
+		// For the corner-case a₁ = (q-1)/α = 16, we have to set a₁=0.
+		a1 &= 15
+	} else if Alpha == 190464 {
+		// 11275/2²⁴ is close enough to 1/1488 so that a₁
+		// becomes a/α rounded down.
+		a1 = ((a1 * 11275) + (1 << 23)) >> 24
+
+		// For the corner-case a₁ = (q-1)/α = 44, we have to set a₁=0.
+		a1 ^= uint32(int32(43-a1)>>31) & a1
+	} else {
+		panic("unsupported α")
+	}
+
+	a0plusQ = a - a1*Alpha
+
+	// In the corner-case, when we set a₁=0, we will incorrectly
+	// have a₀ > (q-1)/2 and we'll need to subtract q. As we
+	// return a₀ + q, that comes down to adding q if a₀ < (q-1)/2.
+	a0plusQ += uint32(int32(a0plusQ-(common.Q-1)/2)>>31) & common.Q
+
+	return
+}
+
+// Assume 0 ≤ r, f < Q with ‖f‖_∞ ≤ α/2. Decompose r as r = r1*α + r0 as
+// computed by decompose(). Write r' := r - f (mod Q). Now, decompose
+// r'=r-f again as r' = r'1*α + r'0 using decompose(). As f is small, we
+// have r'1 = r1 + h, where h ∈ {-1, 0, 1}. makeHint() computes |h|
+// given z0 := r0 - f (mod Q) and r1. With |h|, which is called the hint,
+// we can reconstruct r1 using only r' = r - f, which is done by useHint().
+// To wit:
+//
+//	useHint( r - f, makeHint( r0 - f, r1 ) ) = r1.
+//
+// Assumes 0 ≤ z0 < Q.
+func makeHint(z0, r1 uint32) uint32 {
+	// If -α/2 < r0 - f ≤ α/2, then r1*α + r0 - f is a valid decomposition of r'
+	// with the restrictions of decompose() and so r'1 = r1. So the hint
+	// should be 0. This is covered by the first two inequalities.
+	// There is one other case: if r0 - f = -α/2, then r1*α + r0 - f is also
+	// a valid decomposition if r1 = 0. In the other cases a one is carried
+	// and the hint should be 1.
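+	// Below, z0 ≤ γ₂ covers 0 ≤ r0-f ≤ γ₂ and z0 > q-γ₂ covers
+	// -γ₂ < r0-f < 0; together that is exactly -α/2 < r0-f ≤ α/2 for α = 2γ₂,
+	// and the corner case z0 = q-γ₂ with r1 = 0 handles r0-f = -α/2.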
+ if z0 <= Gamma2 || z0 > common.Q-Gamma2 || (z0 == common.Q-Gamma2 && r1 == 0) { + return 0 + } + return 1 +} + +// Uses the hint created by makeHint() to reconstruct r1 from r'=r-f; see +// documentation of makeHint() for context. +// Assumes 0 ≤ r' < Q. +func useHint(rp uint32, hint uint32) uint32 { + rp0plusQ, rp1 := decompose(rp) + if hint == 0 { + return rp1 + } + if rp0plusQ > common.Q { + return (rp1 + 1) & 15 + } + return (rp1 - 1) & 15 +} + +// Sets p to the hint polynomial for p0 the modified low bits and p1 +// the unmodified high bits --- see makeHint(). +// +// Returns the number of ones in the hint polynomial. +func PolyMakeHint(p, p0, p1 *common.Poly) (pop uint32) { + for i := 0; i < common.N; i++ { + h := makeHint(p0[i], p1[i]) + pop += h + p[i] = h + } + return +} + +// Computes corrections to the high bits of the polynomial q according +// to the hints in h and sets p to the corrected high bits. Returns p. +func PolyUseHint(p, q, hint *common.Poly) { + var q0PlusQ common.Poly + + // See useHint() and makeHint() for an explanation. We reimplement it + // here so that we can call Poly.Decompose(), which might be way faster + // than calling decompose() in a loop (for instance when having AVX2.) + + PolyDecompose(q, &q0PlusQ, p) + + for i := 0; i < common.N; i++ { + if hint[i] == 0 { + continue + } + if Gamma2 == 261888 { + if q0PlusQ[i] > common.Q { + p[i] = (p[i] + 1) & 15 + } else { + p[i] = (p[i] - 1) & 15 + } + } else if Gamma2 == 95232 { + if q0PlusQ[i] > common.Q { + if p[i] == 43 { + p[i] = 0 + } else { + p[i]++ + } + } else { + if p[i] == 0 { + p[i] = 43 + } else { + p[i]-- + } + } + } else { + panic("unsupported γ₂") + } + } +} + +// Splits each of the coefficients of p using decompose. +func PolyDecompose(p, p0PlusQ, p1 *common.Poly) { + for i := 0; i < common.N; i++ { + p0PlusQ[i], p1[i] = decompose(p[i]) + } +} diff --git a/vendor/github.com/cloudflare/circl/sign/mldsa/mldsa65/internal/sample.go b/vendor/github.com/cloudflare/circl/sign/mldsa/mldsa65/internal/sample.go new file mode 100644 index 0000000000..b37370a4ec --- /dev/null +++ b/vendor/github.com/cloudflare/circl/sign/mldsa/mldsa65/internal/sample.go @@ -0,0 +1,339 @@ +// Code generated from mode3/internal/sample.go by gen.go + +package internal + +import ( + "encoding/binary" + + "github.com/cloudflare/circl/internal/sha3" + common "github.com/cloudflare/circl/sign/internal/dilithium" + "github.com/cloudflare/circl/simd/keccakf1600" +) + +// DeriveX4Available indicates whether the system supports the quick fourway +// sampling variants like PolyDeriveUniformX4. +var DeriveX4Available = keccakf1600.IsEnabledX4() + +// For each i, sample ps[i] uniformly from the given seed and nonces[i]. +// ps[i] may be nil and is ignored in that case. +// +// Can only be called when DeriveX4Available is true. +func PolyDeriveUniformX4(ps [4]*common.Poly, seed *[32]byte, nonces [4]uint16) { + var perm keccakf1600.StateX4 + state := perm.Initialize(false) + + // Absorb the seed in the four states + for i := 0; i < 4; i++ { + v := binary.LittleEndian.Uint64(seed[8*i : 8*(i+1)]) + for j := 0; j < 4; j++ { + state[i*4+j] = v + } + } + + // Absorb the nonces, the SHAKE128 domain separator (0b1111), the + // start of the padding (0b...001) and the end of the padding 0b100... + // Recall that the rate of SHAKE128 is 168 --- i.e. 21 uint64s. 
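+	// The 32-byte seed fills words 0–3, so the two nonce bytes and the 0x1f
+	// separator land in word 4 (byte offsets 32–34), and the closing 0x80 of
+	// the padding goes into the last byte of word 20, byte 167 of the rate.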
+	for j := 0; j < 4; j++ {
+		state[4*4+j] = uint64(nonces[j]) | (0x1f << 16)
+		state[20*4+j] = 0x80 << 56
+	}
+
+	var idx [4]int // indices into ps
+	for j := 0; j < 4; j++ {
+		if ps[j] == nil {
+			idx[j] = common.N // mark nil polynomial as completed
+		}
+	}
+
+	done := false
+	for !done {
+		// Applies Keccak-f[1600] to state to get the next 21 uint64s of each
+		// of the four SHAKE128 streams.
+		perm.Permute()
+
+		done = true
+
+	PolyLoop:
+		for j := 0; j < 4; j++ {
+			if idx[j] == common.N {
+				continue
+			}
+			for i := 0; i < 7; i++ {
+				var t [8]uint32
+				t[0] = uint32(state[i*3*4+j] & 0x7fffff)
+				t[1] = uint32((state[i*3*4+j] >> 24) & 0x7fffff)
+				t[2] = uint32((state[i*3*4+j] >> 48) |
+					((state[(i*3+1)*4+j] & 0x7f) << 16))
+				t[3] = uint32((state[(i*3+1)*4+j] >> 8) & 0x7fffff)
+				t[4] = uint32((state[(i*3+1)*4+j] >> 32) & 0x7fffff)
+				t[5] = uint32((state[(i*3+1)*4+j] >> 56) |
+					((state[(i*3+2)*4+j] & 0x7fff) << 8))
+				t[6] = uint32((state[(i*3+2)*4+j] >> 16) & 0x7fffff)
+				t[7] = uint32((state[(i*3+2)*4+j] >> 40) & 0x7fffff)
+
+				for k := 0; k < 8; k++ {
+					if t[k] < common.Q {
+						ps[j][idx[j]] = t[k]
+						idx[j]++
+						if idx[j] == common.N {
+							continue PolyLoop
+						}
+					}
+				}
+			}
+			done = false
+		}
+	}
+}
+
+// Sample p uniformly from the given seed and nonce.
+//
+// p will be normalized.
+func PolyDeriveUniform(p *common.Poly, seed *[32]byte, nonce uint16) {
+	var i, length int
+	var buf [12 * 16]byte // fits 168B SHAKE-128 rate
+
+	length = 168
+
+	sample := func() {
+		// Note that 3 divides into 168 and 12*16, so we use up buf completely.
+		for j := 0; j < length && i < common.N; j += 3 {
+			t := (uint32(buf[j]) | (uint32(buf[j+1]) << 8) |
+				(uint32(buf[j+2]) << 16)) & 0x7fffff
+
+			// We use rejection sampling
+			if t < common.Q {
+				p[i] = t
+				i++
+			}
+		}
+	}
+
+	var iv [32 + 2]byte // 32 byte seed + uint16 nonce
+	h := sha3.NewShake128()
+	copy(iv[:32], seed[:])
+	iv[32] = uint8(nonce)
+	iv[33] = uint8(nonce >> 8)
+	_, _ = h.Write(iv[:])
+
+	for i < common.N {
+		_, _ = h.Read(buf[:168])
+		sample()
+	}
+}
+
+// Sample p uniformly with coefficients of norm less than or equal η,
+// using the given seed and nonce.
+//
+// p will not be normalized, but will have coefficients in [q-η,q+η].
+func PolyDeriveUniformLeqEta(p *common.Poly, seed *[64]byte, nonce uint16) {
+	// Assumes 2 ≤ η < 8.
+	var i, length int
+	var buf [9 * 16]byte // fits 136B SHAKE-256 rate
+
+	length = 136
+
+	sample := func() {
+		// We use rejection sampling
+		for j := 0; j < length && i < common.N; j++ {
+			t1 := uint32(buf[j]) & 15
+			t2 := uint32(buf[j]) >> 4
+			if Eta == 2 { // branch is eliminated by compiler
+				if t1 <= 14 {
+					t1 -= ((205 * t1) >> 10) * 5 // reduce mod 5
+					p[i] = common.Q + Eta - t1
+					i++
+				}
+				if t2 <= 14 && i < common.N {
+					t2 -= ((205 * t2) >> 10) * 5 // reduce mod 5
+					p[i] = common.Q + Eta - t2
+					i++
+				}
+			} else if Eta == 4 {
+				if t1 <= 2*Eta {
+					p[i] = common.Q + Eta - t1
+					i++
+				}
+				if t2 <= 2*Eta && i < common.N {
+					p[i] = common.Q + Eta - t2
+					i++
+				}
+			} else {
+				panic("unsupported η")
+			}
+		}
+	}
+
+	var iv [64 + 2]byte // 64 byte seed + uint16 nonce
+
+	h := sha3.NewShake256()
+	copy(iv[:64], seed[:])
+	iv[64] = uint8(nonce)
+	iv[65] = uint8(nonce >> 8)
+
+	// 136 is SHAKE-256 rate
+	_, _ = h.Write(iv[:])
+
+	for i < common.N {
+		_, _ = h.Read(buf[:136])
+		sample()
+	}
+}
+
+// Sample v[i] uniformly with coefficients in (-γ₁,…,γ₁] using the
+// given seed and nonce+i.
+//
+// p will be normalized.
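+//
+// The nonces consumed are nonce, nonce+1, …, nonce+L-1, which is why
+// SignTo advances yNonce by L after each call.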
+func VecLDeriveUniformLeGamma1(v *VecL, seed *[64]byte, nonce uint16) {
+	for i := 0; i < L; i++ {
+		PolyDeriveUniformLeGamma1(&v[i], seed, nonce+uint16(i))
+	}
+}
+
+// Sample p uniformly with coefficients in (-γ₁,…,γ₁] using the
+// given seed and nonce.
+//
+// p will be normalized.
+func PolyDeriveUniformLeGamma1(p *common.Poly, seed *[64]byte, nonce uint16) {
+	var buf [PolyLeGamma1Size]byte
+
+	var iv [66]byte
+	h := sha3.NewShake256()
+	copy(iv[:64], seed[:])
+	iv[64] = uint8(nonce)
+	iv[65] = uint8(nonce >> 8)
+	_, _ = h.Write(iv[:])
+	_, _ = h.Read(buf[:])
+
+	PolyUnpackLeGamma1(p, buf[:])
+}
+
+// For each i, sample ps[i] uniformly with τ non-zero coefficients in {q-1,1}
+// using the given seed. ps[i] may be nil and is ignored
+// in that case. ps[i] will be normalized.
+//
+// Can only be called when DeriveX4Available is true.
+//
+// This function is not used yet.
+func PolyDeriveUniformBallX4(ps [4]*common.Poly, seed []byte) {
+	var perm keccakf1600.StateX4
+	state := perm.Initialize(false)
+
+	// Absorb the seed in the four states
+	for i := 0; i < CTildeSize/8; i++ {
+		v := binary.LittleEndian.Uint64(seed[8*i : 8*(i+1)])
+		for j := 0; j < 4; j++ {
+			state[i*4+j] = v
+		}
+	}
+
+	// SHAKE256 domain separator and padding
+	for j := 0; j < 4; j++ {
+		state[(CTildeSize/8)*4+j] ^= 0x1f
+		state[16*4+j] ^= 0x80 << 56
+	}
+	perm.Permute()
+
+	var signs [4]uint64
+	var idx [4]uint16 // indices into ps
+
+	for j := 0; j < 4; j++ {
+		if ps[j] != nil {
+			signs[j] = state[j]
+			*ps[j] = common.Poly{} // zero ps[j]
+			idx[j] = common.N - Tau
+		} else {
+			idx[j] = common.N // mark as completed
+		}
+	}
+
+	stateOffset := 1
+	for {
+		done := true
+
+	PolyLoop:
+		for j := 0; j < 4; j++ {
+			if idx[j] == common.N {
+				continue
+			}
+
+			for i := stateOffset; i < 17; i++ {
+				var bs [8]byte
+				binary.LittleEndian.PutUint64(bs[:], state[4*i+j])
+				for k := 0; k < 8; k++ {
+					b := uint16(bs[k])
+
+					if b > idx[j] {
+						continue
+					}
+
+					ps[j][idx[j]] = ps[j][b]
+					ps[j][b] = 1
+					// Takes least significant bit of signs and uses it for the sign.
+					// Note 1 ^ (1 | (Q-1)) = Q-1.
+					ps[j][b] ^= uint32((-(signs[j] & 1)) & (1 | (common.Q - 1)))
+					signs[j] >>= 1
+
+					idx[j]++
+					if idx[j] == common.N {
+						continue PolyLoop
+					}
+				}
+			}
+
+			done = false
+		}
+
+		if done {
+			break
+		}
+
+		perm.Permute()
+		stateOffset = 0
+	}
+}
+
+// Samples p uniformly with τ non-zero coefficients in {q-1,1}.
+//
+// The polynomial p will be normalized.
+func PolyDeriveUniformBall(p *common.Poly, seed []byte) {
+	var buf [136]byte // SHAKE-256 rate is 136
+
+	h := sha3.NewShake256()
+	_, _ = h.Write(seed[:])
+	_, _ = h.Read(buf[:])
+
+	// Essentially we generate a sequence of τ ones or minus ones,
+	// prepend N-τ zeroes and shuffle the concatenation using the
+	// usual algorithm (Fisher--Yates.)
+	signs := binary.LittleEndian.Uint64(buf[:])
+	bufOff := 8 // offset into buf
+
+	*p = common.Poly{} // zero p
+	for i := uint16(common.N - Tau); i < common.N; i++ {
+		var b uint16
+
+		// Find location of where to move the new coefficient to using
+		// rejection sampling.
+		for {
+			if bufOff >= 136 {
+				_, _ = h.Read(buf[:])
+				bufOff = 0
+			}
+
+			b = uint16(buf[bufOff])
+			bufOff++
+
+			if b <= i {
+				break
+			}
+		}
+
+		p[i] = p[b]
+		p[b] = 1
+		// Takes least significant bit of signs and uses it for the sign.
+		// Note 1 ^ (1 | (Q-1)) = Q-1.
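+		// -(signs & 1) is an all-ones mask exactly when the sign bit is set,
+		// so the xor below turns the freshly placed 1 into q-1; otherwise it
+		// leaves it untouched.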
+ p[b] ^= uint32((-(signs & 1)) & (1 | (common.Q - 1))) + signs >>= 1 + } +} diff --git a/vendor/github.com/cloudflare/circl/sign/mldsa/mldsa65/internal/vec.go b/vendor/github.com/cloudflare/circl/sign/mldsa/mldsa65/internal/vec.go new file mode 100644 index 0000000000..d07d3b2458 --- /dev/null +++ b/vendor/github.com/cloudflare/circl/sign/mldsa/mldsa65/internal/vec.go @@ -0,0 +1,281 @@ +// Code generated from mode3/internal/vec.go by gen.go + +package internal + +import ( + common "github.com/cloudflare/circl/sign/internal/dilithium" +) + +// A vector of L polynomials. +type VecL [L]common.Poly + +// A vector of K polynomials. +type VecK [K]common.Poly + +// Normalize the polynomials in this vector. +func (v *VecL) Normalize() { + for i := 0; i < L; i++ { + v[i].Normalize() + } +} + +// Normalize the polynomials in this vector assuming their coefficients +// are already bounded by 2q. +func (v *VecL) NormalizeAssumingLe2Q() { + for i := 0; i < L; i++ { + v[i].NormalizeAssumingLe2Q() + } +} + +// Sets v to w + u. Does not normalize. +func (v *VecL) Add(w, u *VecL) { + for i := 0; i < L; i++ { + v[i].Add(&w[i], &u[i]) + } +} + +// Applies NTT componentwise. See Poly.NTT() for details. +func (v *VecL) NTT() { + for i := 0; i < L; i++ { + v[i].NTT() + } +} + +// Checks whether any of the coefficients exceeds the given bound in supnorm +// +// Requires the vector to be normalized. +func (v *VecL) Exceeds(bound uint32) bool { + for i := 0; i < L; i++ { + if v[i].Exceeds(bound) { + return true + } + } + return false +} + +// Applies Poly.Power2Round componentwise. +// +// Requires the vector to be normalized. +func (v *VecL) Power2Round(v0PlusQ, v1 *VecL) { + for i := 0; i < L; i++ { + v[i].Power2Round(&v0PlusQ[i], &v1[i]) + } +} + +// Applies Poly.Decompose componentwise. +// +// Requires the vector to be normalized. +func (v *VecL) Decompose(v0PlusQ, v1 *VecL) { + for i := 0; i < L; i++ { + PolyDecompose(&v[i], &v0PlusQ[i], &v1[i]) + } +} + +// Sequentially packs each polynomial using Poly.PackLeqEta(). +func (v *VecL) PackLeqEta(buf []byte) { + offset := 0 + for i := 0; i < L; i++ { + PolyPackLeqEta(&v[i], buf[offset:]) + offset += PolyLeqEtaSize + } +} + +// Sets v to the polynomials packed in buf using VecL.PackLeqEta(). +func (v *VecL) UnpackLeqEta(buf []byte) { + offset := 0 + for i := 0; i < L; i++ { + PolyUnpackLeqEta(&v[i], buf[offset:]) + offset += PolyLeqEtaSize + } +} + +// Sequentially packs each polynomial using PolyPackLeGamma1(). +func (v *VecL) PackLeGamma1(buf []byte) { + offset := 0 + for i := 0; i < L; i++ { + PolyPackLeGamma1(&v[i], buf[offset:]) + offset += PolyLeGamma1Size + } +} + +// Sets v to the polynomials packed in buf using VecL.PackLeGamma1(). +func (v *VecL) UnpackLeGamma1(buf []byte) { + offset := 0 + for i := 0; i < L; i++ { + PolyUnpackLeGamma1(&v[i], buf[offset:]) + offset += PolyLeGamma1Size + } +} + +// Normalize the polynomials in this vector. +func (v *VecK) Normalize() { + for i := 0; i < K; i++ { + v[i].Normalize() + } +} + +// Normalize the polynomials in this vector assuming their coefficients +// are already bounded by 2q. +func (v *VecK) NormalizeAssumingLe2Q() { + for i := 0; i < K; i++ { + v[i].NormalizeAssumingLe2Q() + } +} + +// Sets v to w + u. Does not normalize. +func (v *VecK) Add(w, u *VecK) { + for i := 0; i < K; i++ { + v[i].Add(&w[i], &u[i]) + } +} + +// Checks whether any of the coefficients exceeds the given bound in supnorm +// +// Requires the vector to be normalized. 
+func (v *VecK) Exceeds(bound uint32) bool { + for i := 0; i < K; i++ { + if v[i].Exceeds(bound) { + return true + } + } + return false +} + +// Applies Poly.Power2Round componentwise. +// +// Requires the vector to be normalized. +func (v *VecK) Power2Round(v0PlusQ, v1 *VecK) { + for i := 0; i < K; i++ { + v[i].Power2Round(&v0PlusQ[i], &v1[i]) + } +} + +// Applies Poly.Decompose componentwise. +// +// Requires the vector to be normalized. +func (v *VecK) Decompose(v0PlusQ, v1 *VecK) { + for i := 0; i < K; i++ { + PolyDecompose(&v[i], &v0PlusQ[i], &v1[i]) + } +} + +// Sets v to the hint vector for v0 the modified low bits and v1 +// the unmodified high bits --- see makeHint(). +// +// Returns the number of ones in the hint vector. +func (v *VecK) MakeHint(v0, v1 *VecK) (pop uint32) { + for i := 0; i < K; i++ { + pop += PolyMakeHint(&v[i], &v0[i], &v1[i]) + } + return +} + +// Computes corrections to the high bits of the polynomials in the vector +// w using the hints in h and sets v to the corrected high bits. Returns v. +// See useHint(). +func (v *VecK) UseHint(q, hint *VecK) *VecK { + for i := 0; i < K; i++ { + PolyUseHint(&v[i], &q[i], &hint[i]) + } + return v +} + +// Sequentially packs each polynomial using Poly.PackT1(). +func (v *VecK) PackT1(buf []byte) { + offset := 0 + for i := 0; i < K; i++ { + v[i].PackT1(buf[offset:]) + offset += common.PolyT1Size + } +} + +// Sets v to the vector packed into buf by PackT1(). +func (v *VecK) UnpackT1(buf []byte) { + offset := 0 + for i := 0; i < K; i++ { + v[i].UnpackT1(buf[offset:]) + offset += common.PolyT1Size + } +} + +// Sequentially packs each polynomial using Poly.PackT0(). +func (v *VecK) PackT0(buf []byte) { + offset := 0 + for i := 0; i < K; i++ { + v[i].PackT0(buf[offset:]) + offset += common.PolyT0Size + } +} + +// Sets v to the vector packed into buf by PackT0(). +func (v *VecK) UnpackT0(buf []byte) { + offset := 0 + for i := 0; i < K; i++ { + v[i].UnpackT0(buf[offset:]) + offset += common.PolyT0Size + } +} + +// Sequentially packs each polynomial using Poly.PackLeqEta(). +func (v *VecK) PackLeqEta(buf []byte) { + offset := 0 + for i := 0; i < K; i++ { + PolyPackLeqEta(&v[i], buf[offset:]) + offset += PolyLeqEtaSize + } +} + +// Sets v to the polynomials packed in buf using VecK.PackLeqEta(). +func (v *VecK) UnpackLeqEta(buf []byte) { + offset := 0 + for i := 0; i < K; i++ { + PolyUnpackLeqEta(&v[i], buf[offset:]) + offset += PolyLeqEtaSize + } +} + +// Applies NTT componentwise. See Poly.NTT() for details. +func (v *VecK) NTT() { + for i := 0; i < K; i++ { + v[i].NTT() + } +} + +// Sequentially packs each polynomial using PolyPackW1(). +func (v *VecK) PackW1(buf []byte) { + offset := 0 + for i := 0; i < K; i++ { + PolyPackW1(&v[i], buf[offset:]) + offset += PolyW1Size + } +} + +// Sets v to a - b. +// +// Warning: assumes coefficients of the polynomials of b are less than 2q. +func (v *VecK) Sub(a, b *VecK) { + for i := 0; i < K; i++ { + v[i].Sub(&a[i], &b[i]) + } +} + +// Sets v to 2ᵈ w without reducing. +func (v *VecK) MulBy2toD(w *VecK) { + for i := 0; i < K; i++ { + v[i].MulBy2toD(&w[i]) + } +} + +// Applies InvNTT componentwise. See Poly.InvNTT() for details. +func (v *VecK) InvNTT() { + for i := 0; i < K; i++ { + v[i].InvNTT() + } +} + +// Applies Poly.ReduceLe2Q() componentwise. 
+func (v *VecK) ReduceLe2Q() {
+	for i := 0; i < K; i++ {
+		v[i].ReduceLe2Q()
+	}
+}
diff --git a/vendor/github.com/cloudflare/circl/sign/sign.go b/vendor/github.com/cloudflare/circl/sign/sign.go
index 13b20fa4b0..557d6f0960 100644
--- a/vendor/github.com/cloudflare/circl/sign/sign.go
+++ b/vendor/github.com/cloudflare/circl/sign/sign.go
@@ -107,4 +107,7 @@ var (
 	// ErrContextNotSupported is the error used if a context is not
 	// supported.
 	ErrContextNotSupported = errors.New("context not supported")
+
+	// ErrContextTooLong is the error used if the context string is too long.
+	ErrContextTooLong = errors.New("context string too long")
 )
diff --git a/vendor/github.com/cloudflare/circl/simd/keccakf1600/f1600x.go b/vendor/github.com/cloudflare/circl/simd/keccakf1600/f1600x.go
new file mode 100644
index 0000000000..20ac96f006
--- /dev/null
+++ b/vendor/github.com/cloudflare/circl/simd/keccakf1600/f1600x.go
@@ -0,0 +1,163 @@
+// Package keccakf1600 provides two- and four-way Keccak-f[1600] permutations in parallel.
+//
+// Keccak-f[1600] is the permutation underlying several algorithms such as
+// Keccak, SHA3 and SHAKE. Running two or four permutations in parallel is
+// useful in some scenarios like in hash-based signatures.
+//
+// # Limitations
+//
+// Note that not all the architectures support SIMD instructions. This package
+// uses AVX2 instructions that are available in some AMD64 architectures
+// and NEON instructions that are available in some ARM64 architectures.
+//
+// For those systems not supporting these, the package still provides the
+// expected functionality by means of a generic and slow implementation.
+// The recommendation is to verify IsEnabledX4() and IsEnabledX2() beforehand
+// to determine if the current system supports the SIMD implementation.
package keccakf1600
+
+import (
+	"runtime"
+	"unsafe"
+
+	"github.com/cloudflare/circl/internal/sha3"
+	"golang.org/x/sys/cpu"
+)
+
+// StateX4 contains state for the four-way permutation including the four
+// interleaved [25]uint64 buffers. Call Initialize() before use to initialize
+// and get a pointer to the interleaved buffer.
+type StateX4 struct {
+	// Go guarantees a to be aligned on 8 bytes, whereas we need it to be
+	// aligned on 32 bytes for best performance. Thus we leave some headroom
+	// to be able to move the start of the state.
+
+	// 4 x 25 uint64s for the interleaved states and three uint64s headroom
+	// to fix alignment.
+	a [103]uint64
+
+	// Offset into a that is 32 byte aligned.
+	offset int
+
+	// If true, permute will use 12-round Keccak instead of 24-round Keccak.
+	turbo bool
+}
+
+// StateX2 contains state for the two-way permutation including the two
+// interleaved [25]uint64 buffers. Call Initialize() before use to initialize
+// and get a pointer to the interleaved buffer.
+type StateX2 struct {
+	// Go guarantees a to be aligned on 8 bytes, whereas we need it to be
+	// aligned on 32 bytes for best performance. Thus we leave some headroom
+	// to be able to move the start of the state.
+
+	// 2 x 25 uint64s for the interleaved states and three uint64s headroom
+	// to fix alignment.
+	a [53]uint64
+
+	// Offset into a that is 32 byte aligned.
+	offset int
+
+	// If true, permute will use 12-round Keccak instead of 24-round Keccak.
+	turbo bool
+}
+
+// IsEnabledX4 returns true if the architecture supports a four-way SIMD
+// implementation provided in this package.
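+//
+// In practice this is an AVX2 check; when it reports false, Permute falls
+// back to the scalar implementation below.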
+func IsEnabledX4() bool { return cpu.X86.HasAVX2 }
+
+// IsEnabledX2 returns true if the architecture supports a two-way SIMD
+// implementation provided in this package.
+func IsEnabledX2() bool { return enabledX2 }
+
+// Initializes the state and returns the buffer on which the four permutations
+// will act: a uint64 slice of length 100. The first permutation will act
+// on {a[0], a[4], ..., a[96]}, the second on {a[1], a[5], ..., a[97]}, etc.
+// If turbo is true, applies 12-round variant instead of the usual 24.
+func (s *StateX4) Initialize(turbo bool) []uint64 {
+	s.turbo = turbo
+	rp := unsafe.Pointer(&s.a[0])
+
+	// uint64s are always aligned by a multiple of 8. Compute the remainder
+	// of the address modulo 32 divided by 8.
+	rem := (int(uintptr(rp)&31) >> 3)
+
+	if rem != 0 {
+		s.offset = 4 - rem
+	}
+
+	// The slice we return will be aligned on 32 byte boundary.
+	return s.a[s.offset : s.offset+100]
+}
+
+// Initializes the state and returns the buffer on which the two permutations
+// will act: a uint64 slice of length 50. The first permutation will act
+// on {a[0], a[2], ..., a[48]} and the second on {a[1], a[3], ..., a[49]}.
+// If turbo is true, applies 12-round variant instead of the usual 24.
+func (s *StateX2) Initialize(turbo bool) []uint64 {
+	s.turbo = turbo
+	rp := unsafe.Pointer(&s.a[0])
+
+	// uint64s are always aligned by a multiple of 8. Compute the remainder
+	// of the address modulo 32 divided by 8.
+	rem := (int(uintptr(rp)&31) >> 3)
+
+	if rem != 0 {
+		s.offset = 4 - rem
+	}
+
+	// The slice we return will be aligned on 32 byte boundary.
+	return s.a[s.offset : s.offset+50]
+}
+
+// Permute performs the four parallel Keccak-f[1600]s interleaved on the slice
+// returned from Initialize().
+func (s *StateX4) Permute() {
+	if IsEnabledX4() {
+		permuteSIMDx4(s.a[s.offset:], s.turbo)
+	} else {
+		permuteScalarX4(s.a[s.offset:], s.turbo) // A slower generic implementation.
+	}
+}
+
+// Permute performs the two parallel Keccak-f[1600]s interleaved on the slice
+// returned from Initialize().
+func (s *StateX2) Permute() {
+	if IsEnabledX2() {
+		permuteSIMDx2(s.a[s.offset:], s.turbo)
+	} else {
+		permuteScalarX2(s.a[s.offset:], s.turbo) // A slower generic implementation.
+	}
+}
+
+func permuteScalarX4(a []uint64, turbo bool) {
+	var buf [25]uint64
+	for i := 0; i < 4; i++ {
+		for j := 0; j < 25; j++ {
+			buf[j] = a[4*j+i]
+		}
+		sha3.KeccakF1600(&buf, turbo)
+		for j := 0; j < 25; j++ {
+			a[4*j+i] = buf[j]
+		}
+	}
+}
+
+func permuteScalarX2(a []uint64, turbo bool) {
+	var buf [25]uint64
+	for i := 0; i < 2; i++ {
+		for j := 0; j < 25; j++ {
+			buf[j] = a[2*j+i]
+		}
+		sha3.KeccakF1600(&buf, turbo)
+		for j := 0; j < 25; j++ {
+			a[2*j+i] = buf[j]
+		}
+	}
+}
+
+var enabledX2 bool
+
+func init() {
+	enabledX2 = runtime.GOARCH == "arm64" && runtime.GOOS == "darwin"
+}
diff --git a/vendor/github.com/cloudflare/circl/simd/keccakf1600/f1600x2_arm64.go b/vendor/github.com/cloudflare/circl/simd/keccakf1600/f1600x2_arm64.go
new file mode 100644
index 0000000000..0cb9692c32
--- /dev/null
+++ b/vendor/github.com/cloudflare/circl/simd/keccakf1600/f1600x2_arm64.go
@@ -0,0 +1,13 @@
+//go:build arm64 && go1.16 && !purego
+// +build arm64,go1.16,!purego
+
+package keccakf1600
+
+import "github.com/cloudflare/circl/internal/sha3"
+
+func permuteSIMDx2(state []uint64, turbo bool) { f1600x2ARM(&state[0], &sha3.RC, turbo) }
+
+func permuteSIMDx4(state []uint64, turbo bool) { permuteScalarX4(state, turbo) }
+
+//go:noescape
+func f1600x2ARM(state *uint64, rc *[24]uint64, turbo bool)
diff --git a/vendor/github.com/cloudflare/circl/simd/keccakf1600/f1600x2_arm64.s b/vendor/github.com/cloudflare/circl/simd/keccakf1600/f1600x2_arm64.s
new file mode 100644
index 0000000000..998aeca5b4
--- /dev/null
+++ b/vendor/github.com/cloudflare/circl/simd/keccakf1600/f1600x2_arm64.s
@@ -0,0 +1,136 @@
+// +build arm64,go1.16,!purego
+
+// Taken from https://github.com/bwesterb/armed-keccak
+
+#include "textflag.h"
+
+// func f1600x2ARM(state *uint64, rc *[24]uint64, turbo bool)
+TEXT ·f1600x2ARM(SB), NOSPLIT, $0-17
+	MOVD state+0(FP), R0
+	MOVD rc+8(FP), R1
+	MOVD R0, R2
+	MOVD $24, R3
+
+	VLD1.P 64(R0), [ V0.B16,  V1.B16,  V2.B16,  V3.B16]
+	VLD1.P 64(R0), [ V4.B16,  V5.B16,  V6.B16,  V7.B16]
+	VLD1.P 64(R0), [ V8.B16,  V9.B16, V10.B16, V11.B16]
+	VLD1.P 64(R0), [V12.B16, V13.B16, V14.B16, V15.B16]
+	VLD1.P 64(R0), [V16.B16, V17.B16, V18.B16, V19.B16]
+	VLD1.P 64(R0), [V20.B16, V21.B16, V22.B16, V23.B16]
+	VLD1.P (R0), [V24.B16]
+
+	MOVBU turbo+16(FP), R4
+	CBZ R4, loop
+
+	SUB $12, R3, R3
+	ADD $96, R1, R1
+
+loop:
+	// Execute theta but without xorring into the state yet.
+	VEOR3 V10.B16, V5.B16, V0.B16, V25.B16
+	VEOR3 V11.B16, V6.B16, V1.B16, V26.B16
+	VEOR3 V12.B16, V7.B16, V2.B16, V27.B16
+	VEOR3 V13.B16, V8.B16, V3.B16, V28.B16
+	VEOR3 V14.B16, V9.B16, V4.B16, V29.B16
+
+	VEOR3 V20.B16, V15.B16, V25.B16, V25.B16
+	VEOR3 V21.B16, V16.B16, V26.B16, V26.B16
+	VEOR3 V22.B16, V17.B16, V27.B16, V27.B16
+	VEOR3 V23.B16, V18.B16, V28.B16, V28.B16
+	VEOR3 V24.B16, V19.B16, V29.B16, V29.B16
+
+	// Xor parities from step theta into the state at the same time as
+	// executing rho and pi.
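+	// (RAX1 computes x ⊕ (y <<< 1), yielding theta's D-values; XAR rotates
+	// the xor of its two operands, folding theta's xor into rho's rotation.)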
+ VRAX1 V26.D2, V29.D2, V30.D2 + VRAX1 V29.D2, V27.D2, V29.D2 + VRAX1 V27.D2, V25.D2, V27.D2 + VRAX1 V25.D2, V28.D2, V25.D2 + VRAX1 V28.D2, V26.D2, V28.D2 + + VEOR V30.B16, V0.B16, V0.B16 + VMOV V1.B16, V31.B16 + + VXAR $20, V27.D2, V6.D2, V1.D2 + VXAR $44, V25.D2, V9.D2, V6.D2 + VXAR $3 , V28.D2, V22.D2, V9.D2 + VXAR $25, V25.D2, V14.D2, V22.D2 + VXAR $46, V30.D2, V20.D2, V14.D2 + VXAR $2 , V28.D2, V2.D2, V20.D2 + VXAR $21, V28.D2, V12.D2, V2.D2 + VXAR $39, V29.D2, V13.D2, V12.D2 + VXAR $56, V25.D2, V19.D2, V13.D2 + VXAR $8 , V29.D2, V23.D2, V19.D2 + VXAR $23, V30.D2, V15.D2, V23.D2 + VXAR $37, V25.D2, V4.D2, V15.D2 + VXAR $50, V25.D2, V24.D2, V4.D2 + VXAR $62, V27.D2, V21.D2, V24.D2 + VXAR $9 , V29.D2, V8.D2, V21.D2 + VXAR $19, V27.D2, V16.D2, V8.D2 + VXAR $28, V30.D2, V5.D2, V16.D2 + VXAR $36, V29.D2, V3.D2, V5.D2 + VXAR $43, V29.D2, V18.D2, V3.D2 + VXAR $49, V28.D2, V17.D2, V18.D2 + VXAR $54, V27.D2, V11.D2, V17.D2 + VXAR $58, V28.D2, V7.D2, V11.D2 + VXAR $61, V30.D2, V10.D2, V7.D2 + VXAR $63, V27.D2, V31.D2, V10.D2 + + // Chi + VBCAX V1.B16, V2.B16, V0.B16, V25.B16 + VBCAX V2.B16, V3.B16, V1.B16, V26.B16 + VBCAX V3.B16, V4.B16, V2.B16, V2.B16 + VBCAX V4.B16, V0.B16, V3.B16, V3.B16 + VBCAX V0.B16, V1.B16, V4.B16, V4.B16 + VMOV V25.B16, V0.B16 + VMOV V26.B16, V1.B16 + + VBCAX V6.B16, V7.B16, V5.B16, V25.B16 + VBCAX V7.B16, V8.B16, V6.B16, V26.B16 + VBCAX V8.B16, V9.B16, V7.B16, V7.B16 + VBCAX V9.B16, V5.B16, V8.B16, V8.B16 + VBCAX V5.B16, V6.B16, V9.B16, V9.B16 + VMOV V25.B16, V5.B16 + VMOV V26.B16, V6.B16 + + VBCAX V11.B16, V12.B16, V10.B16, V25.B16 + VBCAX V12.B16, V13.B16, V11.B16, V26.B16 + VBCAX V13.B16, V14.B16, V12.B16, V12.B16 + VBCAX V14.B16, V10.B16, V13.B16, V13.B16 + VBCAX V10.B16, V11.B16, V14.B16, V14.B16 + VMOV V25.B16, V10.B16 + VMOV V26.B16, V11.B16 + + VBCAX V16.B16, V17.B16, V15.B16, V25.B16 + VBCAX V17.B16, V18.B16, V16.B16, V26.B16 + VBCAX V18.B16, V19.B16, V17.B16, V17.B16 + VBCAX V19.B16, V15.B16, V18.B16, V18.B16 + VBCAX V15.B16, V16.B16, V19.B16, V19.B16 + VMOV V25.B16, V15.B16 + VMOV V26.B16, V16.B16 + + VBCAX V21.B16, V22.B16, V20.B16, V25.B16 + VBCAX V22.B16, V23.B16, V21.B16, V26.B16 + VBCAX V23.B16, V24.B16, V22.B16, V22.B16 + VBCAX V24.B16, V20.B16, V23.B16, V23.B16 + VBCAX V20.B16, V21.B16, V24.B16, V24.B16 + VMOV V25.B16, V20.B16 + VMOV V26.B16, V21.B16 + + // Iota + VLD1R.P 8(R1), [V25.D2] + VEOR V25.B16, V0.B16, V0.B16 + + SUBS $1, R3, R3 + CBNZ R3, loop + + MOVD R2, R0 + + VST1.P [ V0.B16, V1.B16, V2.B16, V3.B16], 64(R0) + VST1.P [ V4.B16, V5.B16, V6.B16, V7.B16], 64(R0) + VST1.P [ V8.B16, V9.B16, V10.B16, V11.B16], 64(R0) + VST1.P [V12.B16, V13.B16, V14.B16, V15.B16], 64(R0) + VST1.P [V16.B16, V17.B16, V18.B16, V19.B16], 64(R0) + VST1.P [V20.B16, V21.B16, V22.B16, V23.B16], 64(R0) + VST1.P [V24.B16], (R0) + + RET diff --git a/vendor/github.com/cloudflare/circl/simd/keccakf1600/f1600x4_amd64.go b/vendor/github.com/cloudflare/circl/simd/keccakf1600/f1600x4_amd64.go new file mode 100644 index 0000000000..bf5b865d0b --- /dev/null +++ b/vendor/github.com/cloudflare/circl/simd/keccakf1600/f1600x4_amd64.go @@ -0,0 +1,10 @@ +//go:build amd64 && !purego +// +build amd64,!purego + +package keccakf1600 + +import "github.com/cloudflare/circl/internal/sha3" + +func permuteSIMDx4(state []uint64, turbo bool) { f1600x4AVX2(&state[0], &sha3.RC, turbo) } + +func permuteSIMDx2(state []uint64, turbo bool) { permuteScalarX2(state, turbo) } diff --git a/vendor/github.com/cloudflare/circl/simd/keccakf1600/f1600x4_amd64.s 
b/vendor/github.com/cloudflare/circl/simd/keccakf1600/f1600x4_amd64.s new file mode 100644 index 0000000000..67b64550c2 --- /dev/null +++ b/vendor/github.com/cloudflare/circl/simd/keccakf1600/f1600x4_amd64.s @@ -0,0 +1,899 @@ +// Code generated by command: go run src.go -out ../../f1600x4_amd64.s -stubs ../../f1600x4stubs_amd64.go -pkg keccakf1600. DO NOT EDIT. + +//go:build amd64 && !purego + +#include "textflag.h" + +// func f1600x4AVX2(state *uint64, rc *[24]uint64, turbo bool) +// Requires: AVX, AVX2 +TEXT ·f1600x4AVX2(SB), NOSPLIT, $0-17 + MOVQ state+0(FP), AX + MOVQ rc+8(FP), CX + MOVQ $0x0000000000000006, DX + MOVBQZX turbo+16(FP), BX + TESTQ BX, BX + JZ loop + MOVQ $0x0000000000000003, DX + ADDQ $0x60, CX + +loop: + VMOVDQA (AX), Y0 + VMOVDQA 32(AX), Y1 + VMOVDQA 64(AX), Y2 + VMOVDQA 96(AX), Y3 + VMOVDQA 128(AX), Y4 + VPXOR 160(AX), Y0, Y0 + VPXOR 192(AX), Y1, Y1 + VPXOR 224(AX), Y2, Y2 + VPXOR 256(AX), Y3, Y3 + VPXOR 288(AX), Y4, Y4 + VPXOR 320(AX), Y0, Y0 + VPXOR 352(AX), Y1, Y1 + VPXOR 384(AX), Y2, Y2 + VPXOR 416(AX), Y3, Y3 + VPXOR 448(AX), Y4, Y4 + VPXOR 480(AX), Y0, Y0 + VPXOR 512(AX), Y1, Y1 + VPXOR 544(AX), Y2, Y2 + VPXOR 576(AX), Y3, Y3 + VPXOR 608(AX), Y4, Y4 + VPXOR 640(AX), Y0, Y0 + VPXOR 672(AX), Y1, Y1 + VPXOR 704(AX), Y2, Y2 + VPXOR 736(AX), Y3, Y3 + VPXOR 768(AX), Y4, Y4 + VPSLLQ $0x01, Y1, Y5 + VPSLLQ $0x01, Y2, Y6 + VPSLLQ $0x01, Y3, Y7 + VPSLLQ $0x01, Y4, Y8 + VPSLLQ $0x01, Y0, Y9 + VPSRLQ $0x3f, Y1, Y10 + VPSRLQ $0x3f, Y2, Y11 + VPSRLQ $0x3f, Y3, Y12 + VPSRLQ $0x3f, Y4, Y13 + VPSRLQ $0x3f, Y0, Y14 + VPOR Y5, Y10, Y10 + VPOR Y6, Y11, Y11 + VPOR Y7, Y12, Y12 + VPOR Y8, Y13, Y13 + VPOR Y9, Y14, Y14 + VPXOR Y10, Y4, Y10 + VPXOR Y11, Y0, Y11 + VPXOR Y12, Y1, Y12 + VPXOR Y13, Y2, Y13 + VPXOR Y14, Y3, Y14 + VPXOR (AX), Y10, Y0 + VPXOR 192(AX), Y11, Y1 + VPXOR 384(AX), Y12, Y2 + VPXOR 576(AX), Y13, Y3 + VPXOR 768(AX), Y14, Y4 + VPSLLQ $0x2c, Y1, Y6 + VPSLLQ $0x2b, Y2, Y7 + VPSLLQ $0x15, Y3, Y8 + VPSLLQ $0x0e, Y4, Y9 + VPSRLQ $0x14, Y1, Y1 + VPSRLQ $0x15, Y2, Y2 + VPSRLQ $0x2b, Y3, Y3 + VPSRLQ $0x32, Y4, Y4 + VPOR Y6, Y1, Y1 + VPOR Y7, Y2, Y2 + VPOR Y8, Y3, Y3 + VPOR Y9, Y4, Y4 + VPANDN Y2, Y1, Y5 + VPANDN Y3, Y2, Y6 + VPANDN Y4, Y3, Y7 + VPANDN Y0, Y4, Y8 + VPANDN Y1, Y0, Y9 + VPXOR Y0, Y5, Y5 + VPXOR Y1, Y6, Y6 + VPXOR Y2, Y7, Y7 + VPXOR Y3, Y8, Y8 + VPXOR Y4, Y9, Y9 + VPBROADCASTQ (CX), Y0 + VPXOR Y0, Y5, Y5 + VMOVDQA Y5, (AX) + VMOVDQA Y6, 192(AX) + VMOVDQA Y7, 384(AX) + VMOVDQA Y8, 576(AX) + VMOVDQA Y9, 768(AX) + VPXOR 96(AX), Y13, Y0 + VPXOR 288(AX), Y14, Y1 + VPXOR 320(AX), Y10, Y2 + VPXOR 512(AX), Y11, Y3 + VPXOR 704(AX), Y12, Y4 + VPSLLQ $0x1c, Y0, Y5 + VPSLLQ $0x14, Y1, Y6 + VPSLLQ $0x03, Y2, Y7 + VPSLLQ $0x2d, Y3, Y8 + VPSLLQ $0x3d, Y4, Y9 + VPSRLQ $0x24, Y0, Y0 + VPSRLQ $0x2c, Y1, Y1 + VPSRLQ $0x3d, Y2, Y2 + VPSRLQ $0x13, Y3, Y3 + VPSRLQ $0x03, Y4, Y4 + VPOR Y5, Y0, Y0 + VPOR Y6, Y1, Y1 + VPOR Y7, Y2, Y2 + VPOR Y8, Y3, Y3 + VPOR Y9, Y4, Y4 + VPANDN Y2, Y1, Y5 + VPANDN Y3, Y2, Y6 + VPANDN Y4, Y3, Y7 + VPANDN Y0, Y4, Y8 + VPANDN Y1, Y0, Y9 + VPXOR Y0, Y5, Y5 + VPXOR Y1, Y6, Y6 + VPXOR Y2, Y7, Y7 + VPXOR Y3, Y8, Y8 + VPXOR Y4, Y9, Y9 + VMOVDQA Y5, 320(AX) + VMOVDQA Y6, 512(AX) + VMOVDQA Y7, 704(AX) + VMOVDQA Y8, 96(AX) + VMOVDQA Y9, 288(AX) + VPXOR 32(AX), Y11, Y0 + VPXOR 224(AX), Y12, Y1 + VPXOR 416(AX), Y13, Y2 + VPXOR 608(AX), Y14, Y3 + VPXOR 640(AX), Y10, Y4 + VPSLLQ $0x01, Y0, Y5 + VPSLLQ $0x06, Y1, Y6 + VPSLLQ $0x19, Y2, Y7 + VPSLLQ $0x08, Y3, Y8 + VPSLLQ $0x12, Y4, Y9 + VPSRLQ $0x3f, Y0, Y0 + VPSRLQ $0x3a, Y1, Y1 + VPSRLQ $0x27, Y2, Y2 + VPSRLQ $0x38, Y3, 
Y3 + VPSRLQ $0x2e, Y4, Y4 + VPOR Y5, Y0, Y0 + VPOR Y6, Y1, Y1 + VPOR Y7, Y2, Y2 + VPOR Y8, Y3, Y3 + VPOR Y9, Y4, Y4 + VPANDN Y2, Y1, Y5 + VPANDN Y3, Y2, Y6 + VPANDN Y4, Y3, Y7 + VPANDN Y0, Y4, Y8 + VPANDN Y1, Y0, Y9 + VPXOR Y0, Y5, Y5 + VPXOR Y1, Y6, Y6 + VPXOR Y2, Y7, Y7 + VPXOR Y3, Y8, Y8 + VPXOR Y4, Y9, Y9 + VMOVDQA Y5, 640(AX) + VMOVDQA Y6, 32(AX) + VMOVDQA Y7, 224(AX) + VMOVDQA Y8, 416(AX) + VMOVDQA Y9, 608(AX) + VPXOR 128(AX), Y14, Y0 + VPXOR 160(AX), Y10, Y1 + VPXOR 352(AX), Y11, Y2 + VPXOR 544(AX), Y12, Y3 + VPXOR 736(AX), Y13, Y4 + VPSLLQ $0x1b, Y0, Y5 + VPSLLQ $0x24, Y1, Y6 + VPSLLQ $0x0a, Y2, Y7 + VPSLLQ $0x0f, Y3, Y8 + VPSLLQ $0x38, Y4, Y9 + VPSRLQ $0x25, Y0, Y0 + VPSRLQ $0x1c, Y1, Y1 + VPSRLQ $0x36, Y2, Y2 + VPSRLQ $0x31, Y3, Y3 + VPSRLQ $0x08, Y4, Y4 + VPOR Y5, Y0, Y0 + VPOR Y6, Y1, Y1 + VPOR Y7, Y2, Y2 + VPOR Y8, Y3, Y3 + VPOR Y9, Y4, Y4 + VPANDN Y2, Y1, Y5 + VPANDN Y3, Y2, Y6 + VPANDN Y4, Y3, Y7 + VPANDN Y0, Y4, Y8 + VPANDN Y1, Y0, Y9 + VPXOR Y0, Y5, Y5 + VPXOR Y1, Y6, Y6 + VPXOR Y2, Y7, Y7 + VPXOR Y3, Y8, Y8 + VPXOR Y4, Y9, Y9 + VMOVDQA Y5, 160(AX) + VMOVDQA Y6, 352(AX) + VMOVDQA Y7, 544(AX) + VMOVDQA Y8, 736(AX) + VMOVDQA Y9, 128(AX) + VPXOR 64(AX), Y12, Y0 + VPXOR 256(AX), Y13, Y1 + VPXOR 448(AX), Y14, Y2 + VPXOR 480(AX), Y10, Y3 + VPXOR 672(AX), Y11, Y4 + VPSLLQ $0x3e, Y0, Y5 + VPSLLQ $0x37, Y1, Y6 + VPSLLQ $0x27, Y2, Y7 + VPSLLQ $0x29, Y3, Y8 + VPSLLQ $0x02, Y4, Y9 + VPSRLQ $0x02, Y0, Y0 + VPSRLQ $0x09, Y1, Y1 + VPSRLQ $0x19, Y2, Y2 + VPSRLQ $0x17, Y3, Y3 + VPSRLQ $0x3e, Y4, Y4 + VPOR Y5, Y0, Y0 + VPOR Y6, Y1, Y1 + VPOR Y7, Y2, Y2 + VPOR Y8, Y3, Y3 + VPOR Y9, Y4, Y4 + VPANDN Y2, Y1, Y5 + VPANDN Y3, Y2, Y6 + VPANDN Y4, Y3, Y7 + VPANDN Y0, Y4, Y8 + VPANDN Y1, Y0, Y9 + VPXOR Y0, Y5, Y5 + VPXOR Y1, Y6, Y6 + VPXOR Y2, Y7, Y7 + VPXOR Y3, Y8, Y8 + VPXOR Y4, Y9, Y9 + VMOVDQA Y5, 480(AX) + VMOVDQA Y6, 672(AX) + VMOVDQA Y7, 64(AX) + VMOVDQA Y8, 256(AX) + VMOVDQA Y9, 448(AX) + VMOVDQA (AX), Y0 + VMOVDQA 32(AX), Y1 + VMOVDQA 64(AX), Y2 + VMOVDQA 96(AX), Y3 + VMOVDQA 128(AX), Y4 + VPXOR 160(AX), Y0, Y0 + VPXOR 192(AX), Y1, Y1 + VPXOR 224(AX), Y2, Y2 + VPXOR 256(AX), Y3, Y3 + VPXOR 288(AX), Y4, Y4 + VPXOR 320(AX), Y0, Y0 + VPXOR 352(AX), Y1, Y1 + VPXOR 384(AX), Y2, Y2 + VPXOR 416(AX), Y3, Y3 + VPXOR 448(AX), Y4, Y4 + VPXOR 480(AX), Y0, Y0 + VPXOR 512(AX), Y1, Y1 + VPXOR 544(AX), Y2, Y2 + VPXOR 576(AX), Y3, Y3 + VPXOR 608(AX), Y4, Y4 + VPXOR 640(AX), Y0, Y0 + VPXOR 672(AX), Y1, Y1 + VPXOR 704(AX), Y2, Y2 + VPXOR 736(AX), Y3, Y3 + VPXOR 768(AX), Y4, Y4 + VPSLLQ $0x01, Y1, Y5 + VPSLLQ $0x01, Y2, Y6 + VPSLLQ $0x01, Y3, Y7 + VPSLLQ $0x01, Y4, Y8 + VPSLLQ $0x01, Y0, Y9 + VPSRLQ $0x3f, Y1, Y10 + VPSRLQ $0x3f, Y2, Y11 + VPSRLQ $0x3f, Y3, Y12 + VPSRLQ $0x3f, Y4, Y13 + VPSRLQ $0x3f, Y0, Y14 + VPOR Y5, Y10, Y10 + VPOR Y6, Y11, Y11 + VPOR Y7, Y12, Y12 + VPOR Y8, Y13, Y13 + VPOR Y9, Y14, Y14 + VPXOR Y10, Y4, Y10 + VPXOR Y11, Y0, Y11 + VPXOR Y12, Y1, Y12 + VPXOR Y13, Y2, Y13 + VPXOR Y14, Y3, Y14 + VPXOR (AX), Y10, Y0 + VPXOR 512(AX), Y11, Y1 + VPXOR 224(AX), Y12, Y2 + VPXOR 736(AX), Y13, Y3 + VPXOR 448(AX), Y14, Y4 + VPSLLQ $0x2c, Y1, Y6 + VPSLLQ $0x2b, Y2, Y7 + VPSLLQ $0x15, Y3, Y8 + VPSLLQ $0x0e, Y4, Y9 + VPSRLQ $0x14, Y1, Y1 + VPSRLQ $0x15, Y2, Y2 + VPSRLQ $0x2b, Y3, Y3 + VPSRLQ $0x32, Y4, Y4 + VPOR Y6, Y1, Y1 + VPOR Y7, Y2, Y2 + VPOR Y8, Y3, Y3 + VPOR Y9, Y4, Y4 + VPANDN Y2, Y1, Y5 + VPANDN Y3, Y2, Y6 + VPANDN Y4, Y3, Y7 + VPANDN Y0, Y4, Y8 + VPANDN Y1, Y0, Y9 + VPXOR Y0, Y5, Y5 + VPXOR Y1, Y6, Y6 + VPXOR Y2, Y7, Y7 + VPXOR Y3, Y8, Y8 + VPXOR Y4, Y9, Y9 + VPBROADCASTQ 8(CX), Y0 + VPXOR Y0, 
Y5, Y5 + VMOVDQA Y5, (AX) + VMOVDQA Y6, 512(AX) + VMOVDQA Y7, 224(AX) + VMOVDQA Y8, 736(AX) + VMOVDQA Y9, 448(AX) + VPXOR 576(AX), Y13, Y0 + VPXOR 288(AX), Y14, Y1 + VPXOR 640(AX), Y10, Y2 + VPXOR 352(AX), Y11, Y3 + VPXOR 64(AX), Y12, Y4 + VPSLLQ $0x1c, Y0, Y5 + VPSLLQ $0x14, Y1, Y6 + VPSLLQ $0x03, Y2, Y7 + VPSLLQ $0x2d, Y3, Y8 + VPSLLQ $0x3d, Y4, Y9 + VPSRLQ $0x24, Y0, Y0 + VPSRLQ $0x2c, Y1, Y1 + VPSRLQ $0x3d, Y2, Y2 + VPSRLQ $0x13, Y3, Y3 + VPSRLQ $0x03, Y4, Y4 + VPOR Y5, Y0, Y0 + VPOR Y6, Y1, Y1 + VPOR Y7, Y2, Y2 + VPOR Y8, Y3, Y3 + VPOR Y9, Y4, Y4 + VPANDN Y2, Y1, Y5 + VPANDN Y3, Y2, Y6 + VPANDN Y4, Y3, Y7 + VPANDN Y0, Y4, Y8 + VPANDN Y1, Y0, Y9 + VPXOR Y0, Y5, Y5 + VPXOR Y1, Y6, Y6 + VPXOR Y2, Y7, Y7 + VPXOR Y3, Y8, Y8 + VPXOR Y4, Y9, Y9 + VMOVDQA Y5, 640(AX) + VMOVDQA Y6, 352(AX) + VMOVDQA Y7, 64(AX) + VMOVDQA Y8, 576(AX) + VMOVDQA Y9, 288(AX) + VPXOR 192(AX), Y11, Y0 + VPXOR 704(AX), Y12, Y1 + VPXOR 416(AX), Y13, Y2 + VPXOR 128(AX), Y14, Y3 + VPXOR 480(AX), Y10, Y4 + VPSLLQ $0x01, Y0, Y5 + VPSLLQ $0x06, Y1, Y6 + VPSLLQ $0x19, Y2, Y7 + VPSLLQ $0x08, Y3, Y8 + VPSLLQ $0x12, Y4, Y9 + VPSRLQ $0x3f, Y0, Y0 + VPSRLQ $0x3a, Y1, Y1 + VPSRLQ $0x27, Y2, Y2 + VPSRLQ $0x38, Y3, Y3 + VPSRLQ $0x2e, Y4, Y4 + VPOR Y5, Y0, Y0 + VPOR Y6, Y1, Y1 + VPOR Y7, Y2, Y2 + VPOR Y8, Y3, Y3 + VPOR Y9, Y4, Y4 + VPANDN Y2, Y1, Y5 + VPANDN Y3, Y2, Y6 + VPANDN Y4, Y3, Y7 + VPANDN Y0, Y4, Y8 + VPANDN Y1, Y0, Y9 + VPXOR Y0, Y5, Y5 + VPXOR Y1, Y6, Y6 + VPXOR Y2, Y7, Y7 + VPXOR Y3, Y8, Y8 + VPXOR Y4, Y9, Y9 + VMOVDQA Y5, 480(AX) + VMOVDQA Y6, 192(AX) + VMOVDQA Y7, 704(AX) + VMOVDQA Y8, 416(AX) + VMOVDQA Y9, 128(AX) + VPXOR 768(AX), Y14, Y0 + VPXOR 320(AX), Y10, Y1 + VPXOR 32(AX), Y11, Y2 + VPXOR 544(AX), Y12, Y3 + VPXOR 256(AX), Y13, Y4 + VPSLLQ $0x1b, Y0, Y5 + VPSLLQ $0x24, Y1, Y6 + VPSLLQ $0x0a, Y2, Y7 + VPSLLQ $0x0f, Y3, Y8 + VPSLLQ $0x38, Y4, Y9 + VPSRLQ $0x25, Y0, Y0 + VPSRLQ $0x1c, Y1, Y1 + VPSRLQ $0x36, Y2, Y2 + VPSRLQ $0x31, Y3, Y3 + VPSRLQ $0x08, Y4, Y4 + VPOR Y5, Y0, Y0 + VPOR Y6, Y1, Y1 + VPOR Y7, Y2, Y2 + VPOR Y8, Y3, Y3 + VPOR Y9, Y4, Y4 + VPANDN Y2, Y1, Y5 + VPANDN Y3, Y2, Y6 + VPANDN Y4, Y3, Y7 + VPANDN Y0, Y4, Y8 + VPANDN Y1, Y0, Y9 + VPXOR Y0, Y5, Y5 + VPXOR Y1, Y6, Y6 + VPXOR Y2, Y7, Y7 + VPXOR Y3, Y8, Y8 + VPXOR Y4, Y9, Y9 + VMOVDQA Y5, 320(AX) + VMOVDQA Y6, 32(AX) + VMOVDQA Y7, 544(AX) + VMOVDQA Y8, 256(AX) + VMOVDQA Y9, 768(AX) + VPXOR 384(AX), Y12, Y0 + VPXOR 96(AX), Y13, Y1 + VPXOR 608(AX), Y14, Y2 + VPXOR 160(AX), Y10, Y3 + VPXOR 672(AX), Y11, Y4 + VPSLLQ $0x3e, Y0, Y5 + VPSLLQ $0x37, Y1, Y6 + VPSLLQ $0x27, Y2, Y7 + VPSLLQ $0x29, Y3, Y8 + VPSLLQ $0x02, Y4, Y9 + VPSRLQ $0x02, Y0, Y0 + VPSRLQ $0x09, Y1, Y1 + VPSRLQ $0x19, Y2, Y2 + VPSRLQ $0x17, Y3, Y3 + VPSRLQ $0x3e, Y4, Y4 + VPOR Y5, Y0, Y0 + VPOR Y6, Y1, Y1 + VPOR Y7, Y2, Y2 + VPOR Y8, Y3, Y3 + VPOR Y9, Y4, Y4 + VPANDN Y2, Y1, Y5 + VPANDN Y3, Y2, Y6 + VPANDN Y4, Y3, Y7 + VPANDN Y0, Y4, Y8 + VPANDN Y1, Y0, Y9 + VPXOR Y0, Y5, Y5 + VPXOR Y1, Y6, Y6 + VPXOR Y2, Y7, Y7 + VPXOR Y3, Y8, Y8 + VPXOR Y4, Y9, Y9 + VMOVDQA Y5, 160(AX) + VMOVDQA Y6, 672(AX) + VMOVDQA Y7, 384(AX) + VMOVDQA Y8, 96(AX) + VMOVDQA Y9, 608(AX) + VMOVDQA (AX), Y0 + VMOVDQA 32(AX), Y1 + VMOVDQA 64(AX), Y2 + VMOVDQA 96(AX), Y3 + VMOVDQA 128(AX), Y4 + VPXOR 160(AX), Y0, Y0 + VPXOR 192(AX), Y1, Y1 + VPXOR 224(AX), Y2, Y2 + VPXOR 256(AX), Y3, Y3 + VPXOR 288(AX), Y4, Y4 + VPXOR 320(AX), Y0, Y0 + VPXOR 352(AX), Y1, Y1 + VPXOR 384(AX), Y2, Y2 + VPXOR 416(AX), Y3, Y3 + VPXOR 448(AX), Y4, Y4 + VPXOR 480(AX), Y0, Y0 + VPXOR 512(AX), Y1, Y1 + VPXOR 544(AX), Y2, Y2 + VPXOR 576(AX), Y3, Y3 + 
+	VPXOR 608(AX), Y4, Y4
+	VPXOR 640(AX), Y0, Y0
+	VPXOR 672(AX), Y1, Y1
+	VPXOR 704(AX), Y2, Y2
+	VPXOR 736(AX), Y3, Y3
+	VPXOR 768(AX), Y4, Y4
+	VPSLLQ $0x01, Y1, Y5
+	VPSLLQ $0x01, Y2, Y6
+	VPSLLQ $0x01, Y3, Y7
+	VPSLLQ $0x01, Y4, Y8
+	VPSLLQ $0x01, Y0, Y9
+	VPSRLQ $0x3f, Y1, Y10
+	VPSRLQ $0x3f, Y2, Y11
+	VPSRLQ $0x3f, Y3, Y12
+	VPSRLQ $0x3f, Y4, Y13
+	VPSRLQ $0x3f, Y0, Y14
+	VPOR Y5, Y10, Y10
+	VPOR Y6, Y11, Y11
+	VPOR Y7, Y12, Y12
+	VPOR Y8, Y13, Y13
+	VPOR Y9, Y14, Y14
+	VPXOR Y10, Y4, Y10
+	VPXOR Y11, Y0, Y11
+	VPXOR Y12, Y1, Y12
+	VPXOR Y13, Y2, Y13
+	VPXOR Y14, Y3, Y14
+	VPXOR (AX), Y10, Y0
+	VPXOR 352(AX), Y11, Y1
+	VPXOR 704(AX), Y12, Y2
+	VPXOR 256(AX), Y13, Y3
+	VPXOR 608(AX), Y14, Y4
+	VPSLLQ $0x2c, Y1, Y6
+	VPSLLQ $0x2b, Y2, Y7
+	VPSLLQ $0x15, Y3, Y8
+	VPSLLQ $0x0e, Y4, Y9
+	VPSRLQ $0x14, Y1, Y1
+	VPSRLQ $0x15, Y2, Y2
+	VPSRLQ $0x2b, Y3, Y3
+	VPSRLQ $0x32, Y4, Y4
+	VPOR Y6, Y1, Y1
+	VPOR Y7, Y2, Y2
+	VPOR Y8, Y3, Y3
+	VPOR Y9, Y4, Y4
+	VPANDN Y2, Y1, Y5
+	VPANDN Y3, Y2, Y6
+	VPANDN Y4, Y3, Y7
+	VPANDN Y0, Y4, Y8
+	VPANDN Y1, Y0, Y9
+	VPXOR Y0, Y5, Y5
+	VPXOR Y1, Y6, Y6
+	VPXOR Y2, Y7, Y7
+	VPXOR Y3, Y8, Y8
+	VPXOR Y4, Y9, Y9
+	VPBROADCASTQ 16(CX), Y0
+	VPXOR Y0, Y5, Y5
+	VMOVDQA Y5, (AX)
+	VMOVDQA Y6, 352(AX)
+	VMOVDQA Y7, 704(AX)
+	VMOVDQA Y8, 256(AX)
+	VMOVDQA Y9, 608(AX)
+	VPXOR 736(AX), Y13, Y0
+	VPXOR 288(AX), Y14, Y1
+	VPXOR 480(AX), Y10, Y2
+	VPXOR 32(AX), Y11, Y3
+	VPXOR 384(AX), Y12, Y4
+	VPSLLQ $0x1c, Y0, Y5
+	VPSLLQ $0x14, Y1, Y6
+	VPSLLQ $0x03, Y2, Y7
+	VPSLLQ $0x2d, Y3, Y8
+	VPSLLQ $0x3d, Y4, Y9
+	VPSRLQ $0x24, Y0, Y0
+	VPSRLQ $0x2c, Y1, Y1
+	VPSRLQ $0x3d, Y2, Y2
+	VPSRLQ $0x13, Y3, Y3
+	VPSRLQ $0x03, Y4, Y4
+	VPOR Y5, Y0, Y0
+	VPOR Y6, Y1, Y1
+	VPOR Y7, Y2, Y2
+	VPOR Y8, Y3, Y3
+	VPOR Y9, Y4, Y4
+	VPANDN Y2, Y1, Y5
+	VPANDN Y3, Y2, Y6
+	VPANDN Y4, Y3, Y7
+	VPANDN Y0, Y4, Y8
+	VPANDN Y1, Y0, Y9
+	VPXOR Y0, Y5, Y5
+	VPXOR Y1, Y6, Y6
+	VPXOR Y2, Y7, Y7
+	VPXOR Y3, Y8, Y8
+	VPXOR Y4, Y9, Y9
+	VMOVDQA Y5, 480(AX)
+	VMOVDQA Y6, 32(AX)
+	VMOVDQA Y7, 384(AX)
+	VMOVDQA Y8, 736(AX)
+	VMOVDQA Y9, 288(AX)
+	VPXOR 512(AX), Y11, Y0
+	VPXOR 64(AX), Y12, Y1
+	VPXOR 416(AX), Y13, Y2
+	VPXOR 768(AX), Y14, Y3
+	VPXOR 160(AX), Y10, Y4
+	VPSLLQ $0x01, Y0, Y5
+	VPSLLQ $0x06, Y1, Y6
+	VPSLLQ $0x19, Y2, Y7
+	VPSLLQ $0x08, Y3, Y8
+	VPSLLQ $0x12, Y4, Y9
+	VPSRLQ $0x3f, Y0, Y0
+	VPSRLQ $0x3a, Y1, Y1
+	VPSRLQ $0x27, Y2, Y2
+	VPSRLQ $0x38, Y3, Y3
+	VPSRLQ $0x2e, Y4, Y4
+	VPOR Y5, Y0, Y0
+	VPOR Y6, Y1, Y1
+	VPOR Y7, Y2, Y2
+	VPOR Y8, Y3, Y3
+	VPOR Y9, Y4, Y4
+	VPANDN Y2, Y1, Y5
+	VPANDN Y3, Y2, Y6
+	VPANDN Y4, Y3, Y7
+	VPANDN Y0, Y4, Y8
+	VPANDN Y1, Y0, Y9
+	VPXOR Y0, Y5, Y5
+	VPXOR Y1, Y6, Y6
+	VPXOR Y2, Y7, Y7
+	VPXOR Y3, Y8, Y8
+	VPXOR Y4, Y9, Y9
+	VMOVDQA Y5, 160(AX)
+	VMOVDQA Y6, 512(AX)
+	VMOVDQA Y7, 64(AX)
+	VMOVDQA Y8, 416(AX)
+	VMOVDQA Y9, 768(AX)
+	VPXOR 448(AX), Y14, Y0
+	VPXOR 640(AX), Y10, Y1
+	VPXOR 192(AX), Y11, Y2
+	VPXOR 544(AX), Y12, Y3
+	VPXOR 96(AX), Y13, Y4
+	VPSLLQ $0x1b, Y0, Y5
+	VPSLLQ $0x24, Y1, Y6
+	VPSLLQ $0x0a, Y2, Y7
+	VPSLLQ $0x0f, Y3, Y8
+	VPSLLQ $0x38, Y4, Y9
+	VPSRLQ $0x25, Y0, Y0
+	VPSRLQ $0x1c, Y1, Y1
+	VPSRLQ $0x36, Y2, Y2
+	VPSRLQ $0x31, Y3, Y3
+	VPSRLQ $0x08, Y4, Y4
+	VPOR Y5, Y0, Y0
+	VPOR Y6, Y1, Y1
+	VPOR Y7, Y2, Y2
+	VPOR Y8, Y3, Y3
+	VPOR Y9, Y4, Y4
+	VPANDN Y2, Y1, Y5
+	VPANDN Y3, Y2, Y6
+	VPANDN Y4, Y3, Y7
+	VPANDN Y0, Y4, Y8
+	VPANDN Y1, Y0, Y9
+	VPXOR Y0, Y5, Y5
+	VPXOR Y1, Y6, Y6
+	VPXOR Y2, Y7, Y7
+	VPXOR Y3, Y8, Y8
+	VPXOR Y4, Y9, Y9
+	VMOVDQA Y5, 640(AX)
+	VMOVDQA Y6, 192(AX)
+	VMOVDQA Y7, 544(AX)
+	VMOVDQA Y8, 96(AX)
+	VMOVDQA Y9, 448(AX)
+	VPXOR 224(AX), Y12, Y0
+	VPXOR 576(AX), Y13, Y1
+	VPXOR 128(AX), Y14, Y2
+	VPXOR 320(AX), Y10, Y3
+	VPXOR 672(AX), Y11, Y4
+	VPSLLQ $0x3e, Y0, Y5
+	VPSLLQ $0x37, Y1, Y6
+	VPSLLQ $0x27, Y2, Y7
+	VPSLLQ $0x29, Y3, Y8
+	VPSLLQ $0x02, Y4, Y9
+	VPSRLQ $0x02, Y0, Y0
+	VPSRLQ $0x09, Y1, Y1
+	VPSRLQ $0x19, Y2, Y2
+	VPSRLQ $0x17, Y3, Y3
+	VPSRLQ $0x3e, Y4, Y4
+	VPOR Y5, Y0, Y0
+	VPOR Y6, Y1, Y1
+	VPOR Y7, Y2, Y2
+	VPOR Y8, Y3, Y3
+	VPOR Y9, Y4, Y4
+	VPANDN Y2, Y1, Y5
+	VPANDN Y3, Y2, Y6
+	VPANDN Y4, Y3, Y7
+	VPANDN Y0, Y4, Y8
+	VPANDN Y1, Y0, Y9
+	VPXOR Y0, Y5, Y5
+	VPXOR Y1, Y6, Y6
+	VPXOR Y2, Y7, Y7
+	VPXOR Y3, Y8, Y8
+	VPXOR Y4, Y9, Y9
+	VMOVDQA Y5, 320(AX)
+	VMOVDQA Y6, 672(AX)
+	VMOVDQA Y7, 224(AX)
+	VMOVDQA Y8, 576(AX)
+	VMOVDQA Y9, 128(AX)
+	VMOVDQA (AX), Y0
+	VMOVDQA 32(AX), Y1
+	VMOVDQA 64(AX), Y2
+	VMOVDQA 96(AX), Y3
+	VMOVDQA 128(AX), Y4
+	VPXOR 160(AX), Y0, Y0
+	VPXOR 192(AX), Y1, Y1
+	VPXOR 224(AX), Y2, Y2
+	VPXOR 256(AX), Y3, Y3
+	VPXOR 288(AX), Y4, Y4
+	VPXOR 320(AX), Y0, Y0
+	VPXOR 352(AX), Y1, Y1
+	VPXOR 384(AX), Y2, Y2
+	VPXOR 416(AX), Y3, Y3
+	VPXOR 448(AX), Y4, Y4
+	VPXOR 480(AX), Y0, Y0
+	VPXOR 512(AX), Y1, Y1
+	VPXOR 544(AX), Y2, Y2
+	VPXOR 576(AX), Y3, Y3
+	VPXOR 608(AX), Y4, Y4
+	VPXOR 640(AX), Y0, Y0
+	VPXOR 672(AX), Y1, Y1
+	VPXOR 704(AX), Y2, Y2
+	VPXOR 736(AX), Y3, Y3
+	VPXOR 768(AX), Y4, Y4
+	VPSLLQ $0x01, Y1, Y5
+	VPSLLQ $0x01, Y2, Y6
+	VPSLLQ $0x01, Y3, Y7
+	VPSLLQ $0x01, Y4, Y8
+	VPSLLQ $0x01, Y0, Y9
+	VPSRLQ $0x3f, Y1, Y10
+	VPSRLQ $0x3f, Y2, Y11
+	VPSRLQ $0x3f, Y3, Y12
+	VPSRLQ $0x3f, Y4, Y13
+	VPSRLQ $0x3f, Y0, Y14
+	VPOR Y5, Y10, Y10
+	VPOR Y6, Y11, Y11
+	VPOR Y7, Y12, Y12
+	VPOR Y8, Y13, Y13
+	VPOR Y9, Y14, Y14
+	VPXOR Y10, Y4, Y10
+	VPXOR Y11, Y0, Y11
+	VPXOR Y12, Y1, Y12
+	VPXOR Y13, Y2, Y13
+	VPXOR Y14, Y3, Y14
+	VPXOR (AX), Y10, Y0
+	VPXOR 32(AX), Y11, Y1
+	VPXOR 64(AX), Y12, Y2
+	VPXOR 96(AX), Y13, Y3
+	VPXOR 128(AX), Y14, Y4
+	VPSLLQ $0x2c, Y1, Y6
+	VPSLLQ $0x2b, Y2, Y7
+	VPSLLQ $0x15, Y3, Y8
+	VPSLLQ $0x0e, Y4, Y9
+	VPSRLQ $0x14, Y1, Y1
+	VPSRLQ $0x15, Y2, Y2
+	VPSRLQ $0x2b, Y3, Y3
+	VPSRLQ $0x32, Y4, Y4
+	VPOR Y6, Y1, Y1
+	VPOR Y7, Y2, Y2
+	VPOR Y8, Y3, Y3
+	VPOR Y9, Y4, Y4
+	VPANDN Y2, Y1, Y5
+	VPANDN Y3, Y2, Y6
+	VPANDN Y4, Y3, Y7
+	VPANDN Y0, Y4, Y8
+	VPANDN Y1, Y0, Y9
+	VPXOR Y0, Y5, Y5
+	VPXOR Y1, Y6, Y6
+	VPXOR Y2, Y7, Y7
+	VPXOR Y3, Y8, Y8
+	VPXOR Y4, Y9, Y9
+	VPBROADCASTQ 24(CX), Y0
+	VPXOR Y0, Y5, Y5
+	VMOVDQA Y5, (AX)
+	VMOVDQA Y6, 32(AX)
+	VMOVDQA Y7, 64(AX)
+	VMOVDQA Y8, 96(AX)
+	VMOVDQA Y9, 128(AX)
+	VPXOR 256(AX), Y13, Y0
+	VPXOR 288(AX), Y14, Y1
+	VPXOR 160(AX), Y10, Y2
+	VPXOR 192(AX), Y11, Y3
+	VPXOR 224(AX), Y12, Y4
+	VPSLLQ $0x1c, Y0, Y5
+	VPSLLQ $0x14, Y1, Y6
+	VPSLLQ $0x03, Y2, Y7
+	VPSLLQ $0x2d, Y3, Y8
+	VPSLLQ $0x3d, Y4, Y9
+	VPSRLQ $0x24, Y0, Y0
+	VPSRLQ $0x2c, Y1, Y1
+	VPSRLQ $0x3d, Y2, Y2
+	VPSRLQ $0x13, Y3, Y3
+	VPSRLQ $0x03, Y4, Y4
+	VPOR Y5, Y0, Y0
+	VPOR Y6, Y1, Y1
+	VPOR Y7, Y2, Y2
+	VPOR Y8, Y3, Y3
+	VPOR Y9, Y4, Y4
+	VPANDN Y2, Y1, Y5
+	VPANDN Y3, Y2, Y6
+	VPANDN Y4, Y3, Y7
+	VPANDN Y0, Y4, Y8
+	VPANDN Y1, Y0, Y9
+	VPXOR Y0, Y5, Y5
+	VPXOR Y1, Y6, Y6
+	VPXOR Y2, Y7, Y7
+	VPXOR Y3, Y8, Y8
+	VPXOR Y4, Y9, Y9
+	VMOVDQA Y5, 160(AX)
+	VMOVDQA Y6, 192(AX)
+	VMOVDQA Y7, 224(AX)
+	VMOVDQA Y8, 256(AX)
+	VMOVDQA Y9, 288(AX)
+	VPXOR 352(AX), Y11, Y0
+	VPXOR 384(AX), Y12, Y1
+	VPXOR 416(AX), Y13, Y2
+	VPXOR 448(AX), Y14, Y3
+	VPXOR 320(AX), Y10, Y4
+	VPSLLQ $0x01, Y0, Y5
+	VPSLLQ $0x06, Y1, Y6
+	VPSLLQ $0x19, Y2, Y7
+	VPSLLQ $0x08, Y3, Y8
+	VPSLLQ $0x12, Y4, Y9
+	VPSRLQ $0x3f, Y0, Y0
+	VPSRLQ $0x3a, Y1, Y1
+	VPSRLQ $0x27, Y2, Y2
+	VPSRLQ $0x38, Y3, Y3
+	VPSRLQ $0x2e, Y4, Y4
+	VPOR Y5, Y0, Y0
+	VPOR Y6, Y1, Y1
+	VPOR Y7, Y2, Y2
+	VPOR Y8, Y3, Y3
+	VPOR Y9, Y4, Y4
+	VPANDN Y2, Y1, Y5
+	VPANDN Y3, Y2, Y6
+	VPANDN Y4, Y3, Y7
+	VPANDN Y0, Y4, Y8
+	VPANDN Y1, Y0, Y9
+	VPXOR Y0, Y5, Y5
+	VPXOR Y1, Y6, Y6
+	VPXOR Y2, Y7, Y7
+	VPXOR Y3, Y8, Y8
+	VPXOR Y4, Y9, Y9
+	VMOVDQA Y5, 320(AX)
+	VMOVDQA Y6, 352(AX)
+	VMOVDQA Y7, 384(AX)
+	VMOVDQA Y8, 416(AX)
+	VMOVDQA Y9, 448(AX)
+	VPXOR 608(AX), Y14, Y0
+	VPXOR 480(AX), Y10, Y1
+	VPXOR 512(AX), Y11, Y2
+	VPXOR 544(AX), Y12, Y3
+	VPXOR 576(AX), Y13, Y4
+	VPSLLQ $0x1b, Y0, Y5
+	VPSLLQ $0x24, Y1, Y6
+	VPSLLQ $0x0a, Y2, Y7
+	VPSLLQ $0x0f, Y3, Y8
+	VPSLLQ $0x38, Y4, Y9
+	VPSRLQ $0x25, Y0, Y0
+	VPSRLQ $0x1c, Y1, Y1
+	VPSRLQ $0x36, Y2, Y2
+	VPSRLQ $0x31, Y3, Y3
+	VPSRLQ $0x08, Y4, Y4
+	VPOR Y5, Y0, Y0
+	VPOR Y6, Y1, Y1
+	VPOR Y7, Y2, Y2
+	VPOR Y8, Y3, Y3
+	VPOR Y9, Y4, Y4
+	VPANDN Y2, Y1, Y5
+	VPANDN Y3, Y2, Y6
+	VPANDN Y4, Y3, Y7
+	VPANDN Y0, Y4, Y8
+	VPANDN Y1, Y0, Y9
+	VPXOR Y0, Y5, Y5
+	VPXOR Y1, Y6, Y6
+	VPXOR Y2, Y7, Y7
+	VPXOR Y3, Y8, Y8
+	VPXOR Y4, Y9, Y9
+	VMOVDQA Y5, 480(AX)
+	VMOVDQA Y6, 512(AX)
+	VMOVDQA Y7, 544(AX)
+	VMOVDQA Y8, 576(AX)
+	VMOVDQA Y9, 608(AX)
+	VPXOR 704(AX), Y12, Y0
+	VPXOR 736(AX), Y13, Y1
+	VPXOR 768(AX), Y14, Y2
+	VPXOR 640(AX), Y10, Y3
+	VPXOR 672(AX), Y11, Y4
+	VPSLLQ $0x3e, Y0, Y5
+	VPSLLQ $0x37, Y1, Y6
+	VPSLLQ $0x27, Y2, Y7
+	VPSLLQ $0x29, Y3, Y8
+	VPSLLQ $0x02, Y4, Y9
+	VPSRLQ $0x02, Y0, Y0
+	VPSRLQ $0x09, Y1, Y1
+	VPSRLQ $0x19, Y2, Y2
+	VPSRLQ $0x17, Y3, Y3
+	VPSRLQ $0x3e, Y4, Y4
+	VPOR Y5, Y0, Y0
+	VPOR Y6, Y1, Y1
+	VPOR Y7, Y2, Y2
+	VPOR Y8, Y3, Y3
+	VPOR Y9, Y4, Y4
+	VPANDN Y2, Y1, Y5
+	VPANDN Y3, Y2, Y6
+	VPANDN Y4, Y3, Y7
+	VPANDN Y0, Y4, Y8
+	VPANDN Y1, Y0, Y9
+	VPXOR Y0, Y5, Y5
+	VPXOR Y1, Y6, Y6
+	VPXOR Y2, Y7, Y7
+	VPXOR Y3, Y8, Y8
+	VPXOR Y4, Y9, Y9
+	VMOVDQA Y5, 640(AX)
+	VMOVDQA Y6, 672(AX)
+	VMOVDQA Y7, 704(AX)
+	VMOVDQA Y8, 736(AX)
+	VMOVDQA Y9, 768(AX)
+	ADDQ $0x20, CX
+	SUBQ $0x00000001, DX
+	JNZ loop
+	RET
diff --git a/vendor/github.com/cloudflare/circl/simd/keccakf1600/f1600x4stubs_amd64.go b/vendor/github.com/cloudflare/circl/simd/keccakf1600/f1600x4stubs_amd64.go
new file mode 100644
index 0000000000..102fdd04d1
--- /dev/null
+++ b/vendor/github.com/cloudflare/circl/simd/keccakf1600/f1600x4stubs_amd64.go
@@ -0,0 +1,8 @@
+// Code generated by command: go run src.go -out ../../f1600x4_amd64.s -stubs ../../f1600x4stubs_amd64.go -pkg keccakf1600. DO NOT EDIT.
+
+//go:build amd64 && !purego
+
+package keccakf1600
+
+//go:noescape
+func f1600x4AVX2(state *uint64, rc *[24]uint64, turbo bool)
diff --git a/vendor/github.com/cloudflare/circl/simd/keccakf1600/fallback.go b/vendor/github.com/cloudflare/circl/simd/keccakf1600/fallback.go
new file mode 100644
index 0000000000..0da75e9b77
--- /dev/null
+++ b/vendor/github.com/cloudflare/circl/simd/keccakf1600/fallback.go
@@ -0,0 +1,8 @@
+//go:build (!amd64 && !arm64) || (arm64 && !go1.16) || purego
+// +build !amd64,!arm64 arm64,!go1.16 purego
+
+package keccakf1600
+
+func permuteSIMDx2(state []uint64, turbo bool) { permuteScalarX2(state, turbo) }
+
+func permuteSIMDx4(state []uint64, turbo bool) { permuteScalarX4(state, turbo) }
diff --git a/vendor/modules.txt b/vendor/modules.txt
index ad50f53486..427c123c5c 100644
--- a/vendor/modules.txt
+++ b/vendor/modules.txt
@@ -612,8 +612,8 @@ github.com/cloudevents/sdk-go/v2/event/datacodec/xml
 github.com/cloudevents/sdk-go/v2/protocol
 github.com/cloudevents/sdk-go/v2/protocol/http
 github.com/cloudevents/sdk-go/v2/types
-# github.com/cloudflare/circl v1.3.7
-## explicit; go 1.19
+# github.com/cloudflare/circl v1.6.1
+## explicit; go 1.22.0
 github.com/cloudflare/circl/dh/x25519
 github.com/cloudflare/circl/dh/x448
 github.com/cloudflare/circl/ecc/goldilocks
@@ -626,6 +626,11 @@ github.com/cloudflare/circl/math/mlsbset
 github.com/cloudflare/circl/sign
 github.com/cloudflare/circl/sign/ed25519
 github.com/cloudflare/circl/sign/ed448
+github.com/cloudflare/circl/sign/internal/dilithium
+github.com/cloudflare/circl/sign/internal/dilithium/params
+github.com/cloudflare/circl/sign/mldsa/mldsa65
+github.com/cloudflare/circl/sign/mldsa/mldsa65/internal
+github.com/cloudflare/circl/simd/keccakf1600
 # github.com/cncf/xds/go v0.0.0-20250326154945-ae57f3c0d45f
 ## explicit; go 1.19
 github.com/cncf/xds/go/udpa/annotations
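The vendored keccakf1600 package above supplies the four-way Keccak-f[1600] permutation that mldsa65 uses to batch its hashing work: on amd64 the generated f1600x4AVX2 kernel permutes four interleaved states per call, while fallback.go routes other platforms to the scalar implementation. For orientation, below is a minimal sketch of the mldsa65 API vendored by this change (key generation, deterministic signing into a caller-provided buffer, and verification); the message bytes and use of crypto/rand are illustrative, and the nil context with randomized=false matches the deterministic signing mode exercised elsewhere in this change.

package main

import (
	"crypto/rand"
	"fmt"

	"github.com/cloudflare/circl/sign/mldsa/mldsa65"
)

func main() {
	// GenerateKey draws seed material from the supplied reader.
	pub, priv, err := mldsa65.GenerateKey(rand.Reader)
	if err != nil {
		panic(err)
	}

	msg := []byte("example payload") // illustrative message only

	// SignTo writes the signature into a buffer of exactly
	// mldsa65.SignatureSize bytes; a nil context plus randomized=false
	// selects deterministic signing.
	sig := make([]byte, mldsa65.SignatureSize)
	if err := mldsa65.SignTo(priv, msg, nil, false, sig); err != nil {
		panic(err)
	}

	// Verify reports whether sig is valid for msg (and the same nil
	// context) under pub.
	fmt.Println("verified:", mldsa65.Verify(pub, msg, nil, sig))
}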