Architecture: x86_64 using:
- AVX2 (Advanced Vector Extensions 2)
-config CRYPTO_POLYVAL_CLMUL_NI
- tristate "Hash functions: POLYVAL (CLMUL-NI)"
- depends on 64BIT
- select CRYPTO_POLYVAL
- help
- POLYVAL hash function for HCTR2
-
- Architecture: x86_64 using:
- - CLMUL-NI (carry-less multiplication new instructions)
-
config CRYPTO_SM3_AVX_X86_64
tristate "Hash functions: SM3 (AVX)"
depends on 64BIT
obj-$(CONFIG_CRYPTO_GHASH_CLMUL_NI_INTEL) += ghash-clmulni-intel.o
ghash-clmulni-intel-y := ghash-clmulni-intel_asm.o ghash-clmulni-intel_glue.o
-obj-$(CONFIG_CRYPTO_POLYVAL_CLMUL_NI) += polyval-clmulni.o
-polyval-clmulni-y := polyval-clmulni_asm.o polyval-clmulni_glue.o
-
obj-$(CONFIG_CRYPTO_NHPOLY1305_SSE2) += nhpoly1305-sse2.o
nhpoly1305-sse2-y := nh-sse2-x86_64.o nhpoly1305-sse2-glue.o
obj-$(CONFIG_CRYPTO_NHPOLY1305_AVX2) += nhpoly1305-avx2.o
+++ /dev/null
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * Glue code for POLYVAL using PCMULQDQ-NI
- *
- * Copyright (c) 2007 Nokia Siemens Networks - Mikko Herranen <mh1@iki.fi>
- * Copyright (c) 2009 Intel Corp.
- * Author: Huang Ying <ying.huang@intel.com>
- * Copyright 2021 Google LLC
- */
-
-/*
- * Glue code based on ghash-clmulni-intel_glue.c.
- *
- * This implementation of POLYVAL uses montgomery multiplication
- * accelerated by PCLMULQDQ-NI to implement the finite field
- * operations.
- */
-
-#include <asm/cpu_device_id.h>
-#include <asm/fpu/api.h>
-#include <crypto/internal/hash.h>
-#include <crypto/polyval.h>
-#include <crypto/utils.h>
-#include <linux/errno.h>
-#include <linux/kernel.h>
-#include <linux/module.h>
-#include <linux/string.h>
-
-#define POLYVAL_ALIGN 16
-#define POLYVAL_ALIGN_ATTR __aligned(POLYVAL_ALIGN)
-#define POLYVAL_ALIGN_EXTRA ((POLYVAL_ALIGN - 1) & ~(CRYPTO_MINALIGN - 1))
-#define POLYVAL_CTX_SIZE (sizeof(struct polyval_tfm_ctx) + POLYVAL_ALIGN_EXTRA)
-#define NUM_KEY_POWERS 8
-
-struct polyval_tfm_ctx {
- /*
- * These powers must be in the order h^8, ..., h^1.
- */
- u8 key_powers[NUM_KEY_POWERS][POLYVAL_BLOCK_SIZE] POLYVAL_ALIGN_ATTR;
-};
-
-struct polyval_desc_ctx {
- u8 buffer[POLYVAL_BLOCK_SIZE];
-};
-
-asmlinkage void clmul_polyval_update(const struct polyval_tfm_ctx *keys,
- const u8 *in, size_t nblocks, u8 *accumulator);
-asmlinkage void clmul_polyval_mul(u8 *op1, const u8 *op2);
-
-static inline struct polyval_tfm_ctx *polyval_tfm_ctx(struct crypto_shash *tfm)
-{
- return PTR_ALIGN(crypto_shash_ctx(tfm), POLYVAL_ALIGN);
-}
-
-static void internal_polyval_update(const struct polyval_tfm_ctx *keys,
- const u8 *in, size_t nblocks, u8 *accumulator)
-{
- kernel_fpu_begin();
- clmul_polyval_update(keys, in, nblocks, accumulator);
- kernel_fpu_end();
-}
-
-static void internal_polyval_mul(u8 *op1, const u8 *op2)
-{
- kernel_fpu_begin();
- clmul_polyval_mul(op1, op2);
- kernel_fpu_end();
-}
-
-static int polyval_x86_setkey(struct crypto_shash *tfm,
- const u8 *key, unsigned int keylen)
-{
- struct polyval_tfm_ctx *tctx = polyval_tfm_ctx(tfm);
- int i;
-
- if (keylen != POLYVAL_BLOCK_SIZE)
- return -EINVAL;
-
- memcpy(tctx->key_powers[NUM_KEY_POWERS-1], key, POLYVAL_BLOCK_SIZE);
-
- for (i = NUM_KEY_POWERS-2; i >= 0; i--) {
- memcpy(tctx->key_powers[i], key, POLYVAL_BLOCK_SIZE);
- internal_polyval_mul(tctx->key_powers[i],
- tctx->key_powers[i+1]);
- }
-
- return 0;
-}
-
-static int polyval_x86_init(struct shash_desc *desc)
-{
- struct polyval_desc_ctx *dctx = shash_desc_ctx(desc);
-
- memset(dctx, 0, sizeof(*dctx));
-
- return 0;
-}
-
-static int polyval_x86_update(struct shash_desc *desc,
- const u8 *src, unsigned int srclen)
-{
- struct polyval_desc_ctx *dctx = shash_desc_ctx(desc);
- const struct polyval_tfm_ctx *tctx = polyval_tfm_ctx(desc->tfm);
- unsigned int nblocks;
-
- do {
- /* Allow rescheduling every 4K bytes. */
- nblocks = min(srclen, 4096U) / POLYVAL_BLOCK_SIZE;
- internal_polyval_update(tctx, src, nblocks, dctx->buffer);
- srclen -= nblocks * POLYVAL_BLOCK_SIZE;
- src += nblocks * POLYVAL_BLOCK_SIZE;
- } while (srclen >= POLYVAL_BLOCK_SIZE);
-
- return srclen;
-}
-
-static int polyval_x86_finup(struct shash_desc *desc, const u8 *src,
- unsigned int len, u8 *dst)
-{
- struct polyval_desc_ctx *dctx = shash_desc_ctx(desc);
- const struct polyval_tfm_ctx *tctx = polyval_tfm_ctx(desc->tfm);
-
- if (len) {
- crypto_xor(dctx->buffer, src, len);
- internal_polyval_mul(dctx->buffer,
- tctx->key_powers[NUM_KEY_POWERS-1]);
- }
-
- memcpy(dst, dctx->buffer, POLYVAL_BLOCK_SIZE);
-
- return 0;
-}
-
-static struct shash_alg polyval_alg = {
- .digestsize = POLYVAL_DIGEST_SIZE,
- .init = polyval_x86_init,
- .update = polyval_x86_update,
- .finup = polyval_x86_finup,
- .setkey = polyval_x86_setkey,
- .descsize = sizeof(struct polyval_desc_ctx),
- .base = {
- .cra_name = "polyval",
- .cra_driver_name = "polyval-clmulni",
- .cra_priority = 200,
- .cra_flags = CRYPTO_AHASH_ALG_BLOCK_ONLY,
- .cra_blocksize = POLYVAL_BLOCK_SIZE,
- .cra_ctxsize = POLYVAL_CTX_SIZE,
- .cra_module = THIS_MODULE,
- },
-};
-
-__maybe_unused static const struct x86_cpu_id pcmul_cpu_id[] = {
- X86_MATCH_FEATURE(X86_FEATURE_PCLMULQDQ, NULL),
- {}
-};
-MODULE_DEVICE_TABLE(x86cpu, pcmul_cpu_id);
-
-static int __init polyval_clmulni_mod_init(void)
-{
- if (!x86_match_cpu(pcmul_cpu_id))
- return -ENODEV;
-
- if (!boot_cpu_has(X86_FEATURE_AVX))
- return -ENODEV;
-
- return crypto_register_shash(&polyval_alg);
-}
-
-static void __exit polyval_clmulni_mod_exit(void)
-{
- crypto_unregister_shash(&polyval_alg);
-}
-
-module_init(polyval_clmulni_mod_init);
-module_exit(polyval_clmulni_mod_exit);
-
-MODULE_LICENSE("GPL");
-MODULE_DESCRIPTION("POLYVAL hash function accelerated by PCLMULQDQ-NI");
-MODULE_ALIAS_CRYPTO("polyval");
-MODULE_ALIAS_CRYPTO("polyval-clmulni");
#ifdef CONFIG_ARM64
/** @h_powers: Powers of the hash key H^8 through H^1 */
struct polyval_elem h_powers[8];
+#elif defined(CONFIG_X86)
+ /** @h_powers: Powers of the hash key H^8 through H^1 */
+ struct polyval_elem h_powers[8];
#else
#error "Unhandled arch"
#endif
bool
depends on CRYPTO_LIB_POLYVAL && !UML
default y if ARM64 && KERNEL_MODE_NEON
+ default y if X86_64
config CRYPTO_LIB_CHACHA20POLY1305
tristate
ifeq ($(CONFIG_CRYPTO_LIB_POLYVAL_ARCH),y)
CFLAGS_polyval.o += -I$(src)/$(SRCARCH)
libpolyval-$(CONFIG_ARM64) += arm64/polyval-ce-core.o
+libpolyval-$(CONFIG_X86) += x86/polyval-pclmul-avx.o
endif
################################################################################
#define MI %xmm14
#define SUM %xmm15
-#define KEY_POWERS %rdi
-#define MSG %rsi
-#define BLOCKS_LEFT %rdx
-#define ACCUMULATOR %rcx
+#define ACCUMULATOR %rdi
+#define KEY_POWERS %rsi
+#define MSG %rdx
+#define BLOCKS_LEFT %rcx
#define TMP %rax
.section .rodata.cst16.gstar, "aM", @progbits, 16
movups (MSG), %xmm0
pxor SUM, %xmm0
- movaps (KEY_POWERS), %xmm1
+ movups (KEY_POWERS), %xmm1
schoolbook1_noload
dec BLOCKS_LEFT
addq $16, MSG
.endm
/*
- * Perform montgomery multiplication in GF(2^128) and store result in op1.
+ * Computes a = a * b * x^{-128} mod x^128 + x^127 + x^126 + x^121 + 1.
*
- * Computes op1*op2*x^{-128} mod x^128 + x^127 + x^126 + x^121 + 1
- * If op1, op2 are in montgomery form, this computes the montgomery
- * form of op1*op2.
- *
- * void clmul_polyval_mul(u8 *op1, const u8 *op2);
+ * void polyval_mul_pclmul_avx(struct polyval_elem *a,
+ * const struct polyval_elem *b);
*/
-SYM_FUNC_START(clmul_polyval_mul)
+SYM_FUNC_START(polyval_mul_pclmul_avx)
FRAME_BEGIN
vmovdqa .Lgstar(%rip), GSTAR
movups (%rdi), %xmm0
movups SUM, (%rdi)
FRAME_END
RET
-SYM_FUNC_END(clmul_polyval_mul)
+SYM_FUNC_END(polyval_mul_pclmul_avx)
/*
* Perform polynomial evaluation as specified by POLYVAL. This computes:
* h^n * accumulator + h^n * m_0 + ... + h^1 * m_{n-1}
* where n=nblocks, h is the hash key, and m_i are the message blocks.
*
- * rdi - pointer to precomputed key powers h^8 ... h^1
- * rsi - pointer to message blocks
- * rdx - number of blocks to hash
- * rcx - pointer to the accumulator
+ * rdi - pointer to the accumulator
+ * rsi - pointer to precomputed key powers h^8 ... h^1
+ * rdx - pointer to message blocks
+ * rcx - number of blocks to hash
*
- * void clmul_polyval_update(const struct polyval_tfm_ctx *keys,
- * const u8 *in, size_t nblocks, u8 *accumulator);
+ * void polyval_blocks_pclmul_avx(struct polyval_elem *acc,
+ * const struct polyval_key *key,
+ * const u8 *data, size_t nblocks);
*/
-SYM_FUNC_START(clmul_polyval_update)
+SYM_FUNC_START(polyval_blocks_pclmul_avx)
FRAME_BEGIN
vmovdqa .Lgstar(%rip), GSTAR
movups (ACCUMULATOR), SUM
movups SUM, (ACCUMULATOR)
FRAME_END
RET
-SYM_FUNC_END(clmul_polyval_update)
+SYM_FUNC_END(polyval_blocks_pclmul_avx)
--- /dev/null
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * POLYVAL library functions, x86_64 optimized
+ *
+ * Copyright 2025 Google LLC
+ */
+#include <asm/fpu/api.h>
+#include <linux/cpufeature.h>
+
+#define NUM_H_POWERS 8
+
+static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_pclmul_avx);
+
+asmlinkage void polyval_mul_pclmul_avx(struct polyval_elem *a,
+ const struct polyval_elem *b);
+asmlinkage void polyval_blocks_pclmul_avx(struct polyval_elem *acc,
+ const struct polyval_key *key,
+ const u8 *data, size_t nblocks);
+
+static void polyval_preparekey_arch(struct polyval_key *key,
+ const u8 raw_key[POLYVAL_BLOCK_SIZE])
+{
+ static_assert(ARRAY_SIZE(key->h_powers) == NUM_H_POWERS);
+ memcpy(&key->h_powers[NUM_H_POWERS - 1], raw_key, POLYVAL_BLOCK_SIZE);
+ if (static_branch_likely(&have_pclmul_avx) && irq_fpu_usable()) {
+ kernel_fpu_begin();
+ for (int i = NUM_H_POWERS - 2; i >= 0; i--) {
+ key->h_powers[i] = key->h_powers[i + 1];
+ polyval_mul_pclmul_avx(
+ &key->h_powers[i],
+ &key->h_powers[NUM_H_POWERS - 1]);
+ }
+ kernel_fpu_end();
+ } else {
+ for (int i = NUM_H_POWERS - 2; i >= 0; i--) {
+ key->h_powers[i] = key->h_powers[i + 1];
+ polyval_mul_generic(&key->h_powers[i],
+ &key->h_powers[NUM_H_POWERS - 1]);
+ }
+ }
+}
+
+static void polyval_mul_arch(struct polyval_elem *acc,
+ const struct polyval_key *key)
+{
+ if (static_branch_likely(&have_pclmul_avx) && irq_fpu_usable()) {
+ kernel_fpu_begin();
+ polyval_mul_pclmul_avx(acc, &key->h_powers[NUM_H_POWERS - 1]);
+ kernel_fpu_end();
+ } else {
+ polyval_mul_generic(acc, &key->h_powers[NUM_H_POWERS - 1]);
+ }
+}
+
+static void polyval_blocks_arch(struct polyval_elem *acc,
+ const struct polyval_key *key,
+ const u8 *data, size_t nblocks)
+{
+ if (static_branch_likely(&have_pclmul_avx) && irq_fpu_usable()) {
+ do {
+ /* Allow rescheduling every 4 KiB. */
+ size_t n = min_t(size_t, nblocks,
+ 4096 / POLYVAL_BLOCK_SIZE);
+
+ kernel_fpu_begin();
+ polyval_blocks_pclmul_avx(acc, key, data, n);
+ kernel_fpu_end();
+ data += n * POLYVAL_BLOCK_SIZE;
+ nblocks -= n;
+ } while (nblocks);
+ } else {
+ polyval_blocks_generic(acc, &key->h_powers[NUM_H_POWERS - 1],
+ data, nblocks);
+ }
+}
+
+#define polyval_mod_init_arch polyval_mod_init_arch
+static void polyval_mod_init_arch(void)
+{
+ if (boot_cpu_has(X86_FEATURE_PCLMULQDQ) &&
+ boot_cpu_has(X86_FEATURE_AVX))
+ static_branch_enable(&have_pclmul_avx);
+}