/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * Copyright 2025 Google LLC
 *
 * This file is a "template" that generates a CRC function optimized using the
 * RISC-V Zbc (scalar carryless multiplication) extension. The includer of
 * this file must define the following parameters to specify the type of CRC:
 *
 *	crc_t: the data type of the CRC, e.g. u32 for a 32-bit CRC
 *	LSB_CRC: 0 for a msb (most-significant-bit) first CRC, i.e. natural
 *		 mapping between bits and polynomial coefficients
 *		 1 for a lsb (least-significant-bit) first CRC, i.e. reflected
 *		 mapping between bits and polynomial coefficients
 */

#include <asm/byteorder.h>
#include <linux/minmax.h>

#define CRC_BITS (8 * sizeof(crc_t)) /* a.k.a. 'n' */

static inline unsigned long clmul(unsigned long a, unsigned long b)
{
	unsigned long res;

	asm(".option push\n"
	    ".option arch,+zbc\n"
	    "clmul %0, %1, %2\n"
	    ".option pop\n"
	    : "=r" (res) : "r" (a), "r" (b));
	return res;
}

static inline unsigned long clmulh(unsigned long a, unsigned long b)
{
	unsigned long res;

	asm(".option push\n"
	    ".option arch,+zbc\n"
	    "clmulh %0, %1, %2\n"
	    ".option pop\n"
	    : "=r" (res) : "r" (a), "r" (b));
	return res;
}

static inline unsigned long clmulr(unsigned long a, unsigned long b)
{
	unsigned long res;

	asm(".option push\n"
	    ".option arch,+zbc\n"
	    "clmulr %0, %1, %2\n"
	    ".option pop\n"
	    : "=r" (res) : "r" (a), "r" (b));
	return res;
}
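/*
 * For readers unfamiliar with Zbc: the full carryless (GF(2) polynomial)
 * product of two XLEN-bit values is (2*XLEN - 1) bits wide. clmul returns
 * its low XLEN bits, clmulh its high XLEN bits (bits 2*XLEN-1..XLEN), and
 * clmulr the slice in between (bits 2*XLEN-2..XLEN-1). A portable sketch of
 * the clmul semantics, purely illustrative and not used by this file:
 *
 *	unsigned long clmul_ref(unsigned long a, unsigned long b)
 *	{
 *		unsigned long res = 0;
 *		int i;
 *
 *		for (i = 0; i < BITS_PER_LONG; i++)
 *			if ((b >> i) & 1)
 *				res ^= a << i;	// GF(2) addition is XOR
 *		return res;
 *	}
 */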
/*
 * crc_load_long() loads one "unsigned long" of aligned data bytes, producing a
 * polynomial whose bit order matches the CRC's bit order.
 */
#ifdef CONFIG_64BIT
# if LSB_CRC
#  define crc_load_long(x)	le64_to_cpup(x)
# else
#  define crc_load_long(x)	be64_to_cpup(x)
# endif
#else
# if LSB_CRC
#  define crc_load_long(x)	le32_to_cpup(x)
# else
#  define crc_load_long(x)	be32_to_cpup(x)
# endif
#endif

/* XOR @crc into the end of @msgpoly that represents the high-order terms. */
static inline unsigned long crc_clmul_prep(crc_t crc, unsigned long msgpoly)
{
#if LSB_CRC
	return msgpoly ^ crc;
#else
	return msgpoly ^ ((unsigned long)crc << (BITS_PER_LONG - CRC_BITS));
#endif
}

/*
 * Multiply the long-sized @msgpoly by x^n (a.k.a. x^CRC_BITS) and reduce it
 * modulo the generator polynomial G. This gives the CRC of @msgpoly.
 */
static inline crc_t crc_clmul_long(unsigned long msgpoly,
				   const struct crc_clmul_consts *consts)
{
	unsigned long tmp;

	/*
	 * First step of Barrett reduction with integrated multiplication by
	 * x^n: calculate floor((msgpoly * x^n) / G). This is the value by
	 * which G needs to be multiplied to cancel out the x^n and higher
	 * terms of msgpoly * x^n. Do it using the following formula:
	 *
	 * msb-first:
	 *    floor((msgpoly * floor(x^(BITS_PER_LONG-1+n) / G)) / x^(BITS_PER_LONG-1))
	 * lsb-first:
	 *    floor((msgpoly * floor(x^(BITS_PER_LONG-1+n) / G) * x) / x^BITS_PER_LONG)
	 *
	 * barrett_reduction_const_1 contains floor(x^(BITS_PER_LONG-1+n) / G),
	 * which fits a long exactly. Using any lower power of x there would
	 * not carry enough precision through the calculation, while using any
	 * higher power of x would require extra instructions to handle a wider
	 * multiplication. In the msb-first case, using this power of x results
	 * in needing a floored division by x^(BITS_PER_LONG-1), which matches
	 * what clmulr produces. In the lsb-first case, a factor of x gets
	 * implicitly introduced by each carryless multiplication (shown as
	 * '* x' above), and the floored division instead needs to be by
	 * x^BITS_PER_LONG which matches what clmul produces.
	 */
#if LSB_CRC
	tmp = clmul(msgpoly, consts->barrett_reduction_const_1);
#else
	tmp = clmulr(msgpoly, consts->barrett_reduction_const_1);
#endif

	/*
	 * Second step of Barrett reduction:
	 *
	 *    crc := (msgpoly * x^n) + (G * floor((msgpoly * x^n) / G))
	 *
	 * This reduces (msgpoly * x^n) modulo G by adding the appropriate
	 * multiple of G to it. The result uses only the x^0..x^(n-1) terms.
	 * HOWEVER, since the unreduced value (msgpoly * x^n) is zero in those
	 * terms in the first place, it is more efficient to do the equivalent:
	 *
	 *    crc := ((G - x^n) * floor((msgpoly * x^n) / G)) mod x^n
	 *
	 * In the lsb-first case, further modify it to the following, which
	 * avoids a shift, since the CRC ends up in the physically low n bits
	 * from clmulr:
	 *
	 *    product := ((G - x^n) * x^(BITS_PER_LONG - n)) *
	 *		 floor((msgpoly * x^n) / G) * x
	 *    crc := floor(product / x^(BITS_PER_LONG + 1 - n)) mod x^n
	 *
	 * barrett_reduction_const_2 contains the constant multiplier (G - x^n)
	 * or (G - x^n) * x^(BITS_PER_LONG - n) from the formulas above. The
	 * cast of the result to crc_t is essential, as it applies the mod x^n!
	 */
#if LSB_CRC
	return clmulr(tmp, consts->barrett_reduction_const_2);
#else
	return clmul(tmp, consts->barrett_reduction_const_2);
#endif
}

/* Update @crc with the data from @msgpoly. */
static inline crc_t crc_clmul_update_long(crc_t crc, unsigned long msgpoly,
					  const struct crc_clmul_consts *consts)
{
	return crc_clmul_long(crc_clmul_prep(crc, msgpoly), consts);
}
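/*
 * For reference, the constants in struct crc_clmul_consts follow directly
 * from the comments in crc_clmul_long() and the folding step below:
 * barrett_reduction_const_1 is floor(x^(BITS_PER_LONG-1+n) / G),
 * barrett_reduction_const_2 is (G - x^n), scaled by x^(BITS_PER_LONG - n) in
 * the lsb-first case, and the fold constants are powers of x reduced mod G.
 * The following generation-time sketch of polynomial long division shows how
 * such constants could be derived for the msb-first convention; it is purely
 * illustrative (poly_div_msb is not a real helper, and the kernel ships
 * these constants precomputed):
 *
 *	// Returns floor(x^e / G) and optionally x^e mod G. g_low holds the
 *	// n low-order coefficients of G; the leading x^n term is implicit.
 *	unsigned long poly_div_msb(unsigned int e, unsigned int n,
 *				   unsigned long g_low, unsigned long *rem)
 *	{
 *		unsigned long g_full = (1UL << n) | g_low;
 *		unsigned long q = 0, r = 0;
 *		int i;
 *
 *		for (i = e; i >= 0; i--) {
 *			r = (r << 1) | (i == (int)e); // bring down next bit
 *			if (r & (1UL << n)) {	      // degree-n term set:
 *				r ^= g_full;	      // subtract G (XOR)
 *				if (i < BITS_PER_LONG) // q fits a long when
 *					q |= 1UL << i; // e <= BITS_PER_LONG-1+n
 *			}
 *		}
 *		if (rem)
 *			*rem = r;	// x^e mod G, for the fold constants
 *		return q;
 *	}
 *
 * The lsb-first constants additionally account for the reflected bit order
 * (the implicit factors of x noted in crc_clmul_long()).
 */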
/* Update @crc with 1 <= @len < sizeof(unsigned long) bytes of data. */
static inline crc_t crc_clmul_update_partial(crc_t crc, const u8 *p, size_t len,
					     const struct crc_clmul_consts *consts)
{
	unsigned long msgpoly;
	size_t i;

#if LSB_CRC
	msgpoly = (unsigned long)p[0] << (BITS_PER_LONG - 8);
	for (i = 1; i < len; i++)
		msgpoly = (msgpoly >> 8) ^
			  ((unsigned long)p[i] << (BITS_PER_LONG - 8));
#else
	msgpoly = p[0];
	for (i = 1; i < len; i++)
		msgpoly = (msgpoly << 8) ^ p[i];
#endif

	if (len >= sizeof(crc_t)) {
#if LSB_CRC
		msgpoly ^= (unsigned long)crc << (BITS_PER_LONG - 8*len);
#else
		msgpoly ^= (unsigned long)crc << (8*len - CRC_BITS);
#endif
		return crc_clmul_long(msgpoly, consts);
	}
#if LSB_CRC
	msgpoly ^= (unsigned long)crc << (BITS_PER_LONG - 8*len);
	return crc_clmul_long(msgpoly, consts) ^ (crc >> (8*len));
#else
	msgpoly ^= crc >> (CRC_BITS - 8*len);
	return crc_clmul_long(msgpoly, consts) ^ (crc << (8*len));
#endif
}

static inline crc_t crc_clmul(crc_t crc, const void *p, size_t len,
			      const struct crc_clmul_consts *consts)
{
	size_t align;

	/* This implementation assumes that the CRC fits in an unsigned long. */
	BUILD_BUG_ON(sizeof(crc_t) > sizeof(unsigned long));

	/* If the buffer is not long-aligned, align it. */
	align = (unsigned long)p % sizeof(unsigned long);
	if (align && len) {
		align = min(sizeof(unsigned long) - align, len);
		crc = crc_clmul_update_partial(crc, p, align, consts);
		p += align;
		len -= align;
	}

	if (len >= 4 * sizeof(unsigned long)) {
		unsigned long m0, m1;

		m0 = crc_clmul_prep(crc, crc_load_long(p));
		m1 = crc_load_long(p + sizeof(unsigned long));
		p += 2 * sizeof(unsigned long);
		len -= 2 * sizeof(unsigned long);
		/*
		 * Main loop. Each iteration starts with a message polynomial
		 * (x^BITS_PER_LONG)*m0 + m1, then logically extends it by two
		 * more longs of data to form x^(3*BITS_PER_LONG)*m0 +
		 * x^(2*BITS_PER_LONG)*m1 + x^BITS_PER_LONG*m2 + m3, then
		 * "folds" that back into a congruent (modulo G) value that
		 * uses just m0 and m1 again. This is done by multiplying m0
		 * by the precomputed constant (x^(3*BITS_PER_LONG) mod G) and
		 * m1 by the precomputed constant (x^(2*BITS_PER_LONG) mod G),
		 * then adding the results to m2 and m3 as appropriate. Each
		 * such multiplication produces a result twice the length of a
		 * long, which in RISC-V is two instructions clmul and clmulh.
		 *
		 * This could be changed to fold across more than 2 longs at a
		 * time if there is a CPU that can take advantage of it.
		 */
		do {
			unsigned long p0, p1, p2, p3;

			p0 = clmulh(m0, consts->fold_across_2_longs_const_hi);
			p1 = clmul(m0, consts->fold_across_2_longs_const_hi);
			p2 = clmulh(m1, consts->fold_across_2_longs_const_lo);
			p3 = clmul(m1, consts->fold_across_2_longs_const_lo);
			m0 = (LSB_CRC ? p1 ^ p3 : p0 ^ p2) ^ crc_load_long(p);
			m1 = (LSB_CRC ? p0 ^ p2 : p1 ^ p3) ^
			     crc_load_long(p + sizeof(unsigned long));

			p += 2 * sizeof(unsigned long);
			len -= 2 * sizeof(unsigned long);
		} while (len >= 2 * sizeof(unsigned long));

		crc = crc_clmul_long(m0, consts);
		crc = crc_clmul_update_long(crc, m1, consts);
	}

	while (len >= sizeof(unsigned long)) {
		crc = crc_clmul_update_long(crc, crc_load_long(p), consts);
		p += sizeof(unsigned long);
		len -= sizeof(unsigned long);
	}

	if (len)
		crc = crc_clmul_update_partial(crc, p, len, consts);

	return crc;
}
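/*
 * Instantiation sketch. A hypothetical includer of this template (the names
 * crc32_lsb and crc32_lsb_consts are illustrative, not defined by this file)
 * might look like:
 *
 *	#define crc_t	u32
 *	#define LSB_CRC	1
 *	#include "crc-clmul-template.h"	// this file
 *
 *	u32 crc32_lsb(u32 crc, const void *p, size_t len)
 *	{
 *		return crc_clmul(crc, p, len, &crc32_lsb_consts);
 *	}
 *
 * where crc32_lsb_consts is a struct crc_clmul_consts instance precomputed
 * for the CRC's generator polynomial G.
 */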