linux-stable-mirror/scripts/gen-crc-consts.py
Eric Biggers bbe2610bc5 riscv/crc: add "template" for Zbc optimized CRC functions
Add a "template" crc-clmul-template.h that can generate RISC-V Zbc
optimized CRC functions.  Each generated CRC function is parameterized
by CRC length and bit order, and it accepts a pointer to the constants
struct required for the specific CRC polynomial desired.  Update
gen-crc-consts.py to support generating the needed constants structs.

This makes it possible to easily wire up a Zbc optimized implementation
of almost any CRC.

The design generally follows what I did for x86, but it is simplified by
using RISC-V's scalar carryless multiplication Zbc, which has no
equivalent on x86.  RISC-V's clmulr instruction is also helpful.  A
potential switch to Zvbc (or support for Zvbc alongside Zbc) is left for
future work.  For long messages Zvbc should be fastest, but it would
need to be shown to be worthwhile over just using Zbc which is
significantly more convenient to use, especially in the kernel context.

Compared to the existing Zbc-optimized CRC32 code and the earlier
proposed Zbc-optimized CRC-T10DIF code
(https://lore.kernel.org/r/20250211071101.181652-1-zhihang.shao.iscas@gmail.com),
this submission deduplicates the code among CRC variants and is
significantly more optimized.  It uses "folding" to take better
advantage of instruction-level parallelism (to a more limited extent
than x86 for now, but it could be extended to more), it reworks the
Barrett reduction to eliminate unnecessary instructions, and it
documents all the math used and makes all the constants reproducible.

Tested-by: Björn Töpel <bjorn@rivosinc.com>
Acked-by: Alexandre Ghiti <alexghiti@rivosinc.com>
Link: https://lore.kernel.org/r/20250216225530.306980-2-ebiggers@kernel.org
Signed-off-by: Eric Biggers <ebiggers@google.com>
2025-03-10 09:29:08 -07:00

292 lines
12 KiB
Python
Executable File

#!/usr/bin/env python3
# SPDX-License-Identifier: GPL-2.0-or-later
#
# Script that generates constants for computing the given CRC variant(s).
#
# Copyright 2025 Google LLC
#
# Author: Eric Biggers <ebiggers@google.com>
import sys
def xor(iterable):
    """Return the XOR (i.e. the GF(2) sum) of all polynomials in *iterable*.

    An empty iterable yields 0, the zero polynomial.
    """
    acc = 0
    for term in iterable:
        acc ^= term
    return acc
def clmul(a, b):
    """Carryless (polynomial) multiplication of 'a' and 'b' over GF(2)."""
    product = 0
    shifted_a = a
    # Classic shift-and-xor: for each set bit of b, add the suitably
    # shifted copy of a into the product.
    while b:
        if b & 1:
            product ^= shifted_a
        shifted_a <<= 1
        b >>= 1
    return product
def div(a, b):
    """Polynomial long division: return floor(a / b) over GF(2)."""
    quotient = 0
    deg_b = b.bit_length()
    # Cancel the leading term of 'a' with a shifted copy of 'b' until the
    # remainder's degree drops below that of 'b'.
    shift = a.bit_length() - deg_b
    while shift >= 0:
        quotient ^= 1 << shift
        a ^= b << shift
        shift = a.bit_length() - deg_b
    return quotient
def reduce(a, b):
    """Return the polynomial 'a' reduced modulo the polynomial 'b', i.e. the
    remainder of the GF(2) polynomial division of a by b."""
    # Equivalent to a ^ clmul(div(a, b), b), but computed directly: keep
    # cancelling a's leading term until its degree is below b's.
    deg_b = b.bit_length()
    while a.bit_length() >= deg_b:
        a ^= b << (a.bit_length() - deg_b)
    return a
def bitreflect(poly, num_bits):
    """Return *poly* with its low *num_bits* bits in reversed order."""
    assert poly.bit_length() <= num_bits
    reflected = 0
    # Bit i of the input becomes bit (num_bits - 1 - i) of the output; the
    # target bits are disjoint, so OR accumulates the same result as XOR.
    for i in range(num_bits):
        if (poly >> i) & 1:
            reflected |= 1 << (num_bits - 1 - i)
    return reflected
def fmt_poly(variant, poly, num_bits):
    """Format a polynomial as a zero-padded hex literal of num_bits/4 digits.
    The polynomial is bit-reflected first if the CRC variant is lsb-first."""
    value = bitreflect(poly, num_bits) if variant.lsb else poly
    hex_digits = num_bits // 4
    return f'0x{value:0{hex_digits}x}'
def print_mult_pair(variant, mults):
    """Print a pair of 64-bit polynomial multipliers.  They are always passed
    in the order [HI64_TERMS, LO64_TERMS] but are printed in the order the
    consuming code expects for this variant's bit order."""
    if variant.lsb:
        ordered = list(mults)
        labels = ['HI64_TERMS', 'LO64_TERMS']
    else:
        ordered = list(reversed(mults))
        labels = ['LO64_TERMS', 'HI64_TERMS']
    for mult, label in zip(ordered, labels):
        print(f'\t\t{fmt_poly(variant, mult["val"], 64)},\t/* {label}: {mult["desc"]} */')
def pprint_poly(prefix, poly):
    """Pretty-print a polynomial as a sum of powers of x, word-wrapping the
    terms across lines.  Continuation lines are indented to match *prefix*."""
    degrees = [i for i in reversed(range(poly.bit_length()))
               if poly & (1 << i)]
    terms = [f'x^{d}' for d in degrees]
    idx = 0
    while idx < len(terms):
        # Start a new line, then greedily append terms while it stays short.
        line = prefix + terms[idx] + (' +' if idx < len(terms) - 1 else '')
        idx += 1
        while idx < len(terms) and len(line) < 73:
            line += ' ' + terms[idx] + (' +' if idx < len(terms) - 1 else '')
            idx += 1
        print(line)
        # Continuation lines keep the ' * ' comment margin plus padding.
        prefix = ' * ' + ' ' * (len(prefix) - 3)
def print_header(variant, what):
    """Print a C block comment describing the constants generated for the
    given CRC variant ('what' names the kind of constants)."""
    order = 'least' if variant.lsb else 'most'
    print('/*')
    print(f' * {what} generated for {order}-significant-bit-first CRC-{variant.bits} using')
    pprint_poly(' * G(x) = ', variant.G)
    print(' */')
class CrcVariant:
    """One CRC variant, defined by its width in bits, its generator
    polynomial (excluding the x^bits term), and its bit order."""

    def __init__(self, bits, generator_poly, bit_order):
        if bit_order not in ('lsb', 'msb'):
            raise ValueError('Invalid value for bit_order')
        self.bits = bits
        self.lsb = (bit_order == 'lsb')
        # Identifier used in the names of the generated C symbols; the
        # polynomial is shown in the caller-supplied bit order.
        hex_width = (2 * bits + 7) // 8
        self.name = f'crc{bits}_{bit_order}_0x{generator_poly:0{hex_width}x}'
        # Internally, G always includes the x^bits term and is stored
        # msb-first, so reflect lsb-first input polynomials.
        if self.lsb:
            generator_poly = bitreflect(generator_poly, bits)
        self.G = generator_poly ^ (1 << bits)
def gen_slicebyN_tables(variants, n):
    """Generate lookup tables for CRC computation using the "slice-by-N"
    method.  N=1 corresponds to the traditional byte-at-a-time table."""
    for v in variants:
        print('')
        print_header(v, f'Slice-by-{n} CRC table')
        print(f'static const u{v.bits} __maybe_unused {v.name}_table[{256*n}] = {{')
        line = ''
        for i in range(256 * n):
            # The i'th table entry is the CRC of the message consisting of
            # byte i % 256 followed by i // 256 zero bytes.
            byte_val = bitreflect(i % 256, 8) if v.lsb else i % 256
            msg_poly = byte_val << (v.bits + 8 * (i // 256))
            entry = fmt_poly(v, reduce(msg_poly, v.G), v.bits) + ','
            # Flush the pending line before it would grow past ~72 columns.
            if len(line + entry) > 71:
                print(f'\t{line}')
                line = ''
            line += (' ' if line else '') + entry
        if line:
            print(f'\t{line}')
        print('};')
def print_riscv_const(v, bits_per_long, name, val, desc):
    """Print one member initializer of a crc_clmul_consts struct."""
    formatted = fmt_poly(v, val, bits_per_long)
    print(f'\t.{name} = {formatted}, /* {desc} */')
def do_gen_riscv_clmul_consts(v, bits_per_long):
    """Print the field initializers of a crc_clmul_consts struct for CRC
    variant 'v', targeting a machine whose 'unsigned long' has
    'bits_per_long' bits (32 or 64)."""
    (G, n, lsb) = (v.G, v.bits, v.lsb)
    # Folding constants: x^(3*BPL) mod G and x^(2*BPL) mod G.  For lsb-first
    # CRCs the exponent is reduced by one.  NOTE(review): presumably this
    # compensates for the one-bit misalignment of the carryless-multiply
    # product in the lsb-first case -- confirm against crc-clmul-template.h.
    pow_of_x = 3 * bits_per_long - (1 if lsb else 0)
    print_riscv_const(v, bits_per_long, 'fold_across_2_longs_const_hi',
                      reduce(1 << pow_of_x, G), f'x^{pow_of_x} mod G')
    pow_of_x = 2 * bits_per_long - (1 if lsb else 0)
    print_riscv_const(v, bits_per_long, 'fold_across_2_longs_const_lo',
                      reduce(1 << pow_of_x, G), f'x^{pow_of_x} mod G')
    # First Barrett reduction constant: floor(x^(BPL-1+n) / G).
    pow_of_x = bits_per_long - 1 + n
    print_riscv_const(v, bits_per_long, 'barrett_reduction_const_1',
                      div(1 << pow_of_x, G), f'floor(x^{pow_of_x} / G)')
    # Second Barrett reduction constant: G with its x^n term dropped, so the
    # value fits in n bits; for lsb-first CRCs it is pre-shifted into the
    # high-order bits of the long.
    val = G - (1 << n)
    desc = f'G - x^{n}'
    if lsb:
        val <<= bits_per_long - n
        desc = f'({desc}) * x^{bits_per_long - n}'
    print_riscv_const(v, bits_per_long, 'barrett_reduction_const_2', val, desc)
def gen_riscv_clmul_consts(variants):
    """Generate constants for RISC-V Zbc (carryless multiplication) based CRC
    computation: print the crc_clmul_consts struct definition, then one
    instance of it per CRC variant.

    Fixes over the previous version: removed the stray trailing semicolons
    (un-Pythonic, flagged by linters) and hoisted the print_header() call
    that was duplicated in both branches of the width check.
    """
    print('')
    print('struct crc_clmul_consts {')
    print('\tunsigned long fold_across_2_longs_const_hi;')
    print('\tunsigned long fold_across_2_longs_const_lo;')
    print('\tunsigned long barrett_reduction_const_1;')
    print('\tunsigned long barrett_reduction_const_2;')
    print('};')
    for v in variants:
        print('')
        print_header(v, 'Constants')
        if v.bits > 32:
            # A CRC wider than 32 bits only fits in a 64-bit long, so the
            # whole struct is conditional on CONFIG_64BIT.
            print('#ifdef CONFIG_64BIT')
            print(f'static const struct crc_clmul_consts {v.name}_consts __maybe_unused = {{')
            do_gen_riscv_clmul_consts(v, 64)
            print('};')
            print('#endif')
        else:
            # Narrower CRCs work on both widths; emit both sets of field
            # values, selected by CONFIG_64BIT inside the initializer.
            print(f'static const struct crc_clmul_consts {v.name}_consts __maybe_unused = {{')
            print('#ifdef CONFIG_64BIT')
            do_gen_riscv_clmul_consts(v, 64)
            print('#else')
            do_gen_riscv_clmul_consts(v, 32)
            print('#endif')
            print('};')
# Generate constants for carryless multiplication based CRC computation.
def gen_x86_pclmul_consts(variants):
    """Print, for each CRC variant, the constants struct used by the x86
    PCLMULQDQ-based CRC code: fold-across-i-bits multiplier pairs, a byte
    shuffle table for partial final blocks, and Barrett reduction constants."""
    # These are the distances, in bits, to generate folding constants for.
    FOLD_DISTANCES = [2048, 1024, 512, 256, 128]
    for v in variants:
        (G, n, lsb) = (v.G, v.bits, v.lsb)
        print('')
        print_header(v, 'CRC folding constants')
        # Emit the anonymous struct type, then its initializer.
        print('static const struct {')
        if not lsb:
            print('\tu8 bswap_mask[16];')
        for i in FOLD_DISTANCES:
            print(f'\tu64 fold_across_{i}_bits_consts[2];')
        print('\tu8 shuf_table[48];')
        print('\tu64 barrett_reduction_consts[2];')
        print(f'}} {v.name}_consts ____cacheline_aligned __maybe_unused = {{')
        # Byte-reflection mask, needed for msb-first CRCs
        if not lsb:
            print('\t.bswap_mask = {' + ', '.join(str(i) for i in reversed(range(16))) + '},')
        # Fold constants for all distances down to 128 bits
        for i in FOLD_DISTANCES:
            print(f'\t.fold_across_{i}_bits_consts = {{')
            # Given 64x64 => 128 bit carryless multiplication instructions, two
            # 64-bit fold constants are needed per "fold distance" i: one for
            # HI64_TERMS that is basically x^(i+64) mod G and one for LO64_TERMS
            # that is basically x^i mod G.  The exact values however undergo a
            # couple adjustments, described below.
            mults = []
            for j in [64, 0]:
                pow_of_x = i + j
                if lsb:
                    # Each 64x64 => 128 bit carryless multiplication instruction
                    # actually generates a 127-bit product in physical bits 0
                    # through 126, which in the lsb-first case represent the
                    # coefficients of x^1 through x^127, not x^0 through x^126.
                    # Thus in the lsb-first case, each such instruction
                    # implicitly adds an extra factor of x.  The below removes a
                    # factor of x from each constant to compensate for this.
                    # For n < 64 the x could be removed from either the reduced
                    # part or unreduced part, but for n == 64 the reduced part
                    # is the only option.  Just always use the reduced part.
                    pow_of_x -= 1
                # Make a factor of x^(64-n) be applied unreduced rather than
                # reduced, to cause the product to use only the x^(64-n) and
                # higher terms and always be zero in the lower terms.  Usually
                # this makes no difference as it does not affect the product's
                # congruence class mod G and the constant remains 64-bit, but
                # part of the final reduction from 128 bits does rely on this
                # property when it reuses one of the constants.
                pow_of_x -= 64 - n
                mults.append({ 'val': reduce(1 << pow_of_x, G) << (64 - n),
                               'desc': f'(x^{pow_of_x} mod G) * x^{64-n}' })
            print_mult_pair(v, mults)
            print('\t},')
        # Shuffle table for handling 1..15 bytes at end
        print('\t.shuf_table = {')
        print('\t\t' + (16*'-1, ').rstrip())
        print('\t\t' + ''.join(f'{i:2}, ' for i in range(16)).rstrip())
        print('\t\t' + (16*'-1, ').rstrip())
        print('\t},')
        # Barrett reduction constants for reducing 128 bits to the final CRC
        print('\t.barrett_reduction_consts = {')
        mults = []
        val = div(1 << (63+n), G)
        desc = f'floor(x^{63+n} / G)'
        if not lsb:
            val = (val << 1) - (1 << 64)
            desc = f'({desc} * x) - x^64'
        mults.append({ 'val': val, 'desc': desc })
        val = G - (1 << n)
        desc = f'G - x^{n}'
        if lsb and n == 64:
            assert (val & 1) != 0  # The x^0 term should always be nonzero.
            val >>= 1
            desc = f'({desc} - x^0) / x'
        else:
            pow_of_x = 64 - n - (1 if lsb else 0)
            val <<= pow_of_x
            desc = f'({desc}) * x^{pow_of_x}'
        mults.append({ 'val': val, 'desc': desc })
        print_mult_pair(v, mults)
        print('\t},')
        print('};')
def parse_crc_variants(vars_string):
    """Parse a comma-separated list of CRC variant strings, e.g.
    'crc16_msb_0x8bb7,crc32_lsb_0xedb88320', into a list of CrcVariant.

    Raises ValueError on malformed input.  (Explicit raises replace the
    previous 'assert' statements, which validate user-supplied command-line
    input yet are silently stripped when Python runs with -O.)
    """
    variants = []
    for var_string in vars_string.split(','):
        fields = var_string.split('_')
        if len(fields) != 3:
            raise ValueError(f'Invalid CRC variant string: {var_string!r}')
        bits, bit_order, generator_poly = fields
        if not bits.startswith('crc'):
            raise ValueError(f'CRC variant must begin with "crc": {var_string!r}')
        bits = int(bits.removeprefix('crc'))
        if not generator_poly.startswith('0x'):
            raise ValueError(f'Generator polynomial must begin with "0x": {var_string!r}')
        generator_poly = generator_poly.removeprefix('0x')
        if len(generator_poly) % 2 != 0:
            raise ValueError(
                f'Generator polynomial must have an even number of hex digits: {var_string!r}')
        generator_poly = int(generator_poly, 16)
        variants.append(CrcVariant(bits, generator_poly, bit_order))
    return variants
# Command-line entry point: argv[1] is the comma-separated list of constants
# types, argv[2] the comma-separated list of CRC variants.
if len(sys.argv) != 3:
    usage = (
        f'Usage: {sys.argv[0]} CONSTS_TYPE[,CONSTS_TYPE]... CRC_VARIANT[,CRC_VARIANT]...\n'
        ' CONSTS_TYPE can be sliceby[1-8], riscv_clmul, or x86_pclmul\n'
        ' CRC_VARIANT is crc${num_bits}_${bit_order}_${generator_poly_as_hex}\n'
        ' E.g. crc16_msb_0x8bb7 or crc32_lsb_0xedb88320\n'
        ' Polynomial must use the given bit_order and exclude x^{num_bits}\n'
    )
    sys.stderr.write(usage)
    sys.exit(1)
# Header comment recording exactly how the output was generated.
for header_line in ('/* SPDX-License-Identifier: GPL-2.0-or-later */',
                    '/*',
                    ' * CRC constants generated by:',
                    ' *',
                    f' *\t{sys.argv[0]} {" ".join(sys.argv[1:])}',
                    ' *',
                    ' * Do not edit manually.',
                    ' */'):
    print(header_line)
consts_types = sys.argv[1].split(',')
variants = parse_crc_variants(sys.argv[2])
for consts_type in consts_types:
    if consts_type.startswith('sliceby'):
        gen_slicebyN_tables(variants, int(consts_type.removeprefix('sliceby')))
    elif consts_type == 'riscv_clmul':
        gen_riscv_clmul_consts(variants)
    elif consts_type == 'x86_pclmul':
        gen_x86_pclmul_consts(variants)
    else:
        raise ValueError(f'Unknown consts_type: {consts_type}')