|
| 1 | +From e9a688bcb19348862afe30d7c85bc37c4c293471 Mon Sep 17 00:00:00 2001 |
| 2 | +From: "Jason A. Donenfeld" <Jason@zx2c4.com> |
| 3 | +Date: Sat, 8 Oct 2022 20:42:54 -0600 |
| 4 | +Subject: [PATCH] random: use rejection sampling for uniform bounded random |
| 5 | + integers |
| 6 | + |
| 7 | +Until the very recent commits, many bounded random integers were |
| 8 | +calculated using `get_random_u32() % max_plus_one`, which not only |
| 9 | +incurs the price of a division -- indicating performance mostly was not |
| 10 | +a real issue -- but also does not result in a uniformly distributed |
| 11 | +output if max_plus_one is not a power of two. Recent commits moved to |
| 12 | +using `prandom_u32_max(max_plus_one)`, which replaces the division with |
| 13 | +a faster multiplication, but still does not solve the issue with |
| 14 | +non-uniform output. |
| 15 | + |
| 16 | +For some users, maybe this isn't a problem, and for others, maybe it is,
| 17 | +but for the majority of users, the question has likely never been posed
| 18 | +and analyzed, with nobody thinking much about it, instead assuming that
| 19 | +random is random is random. In other words, the unthinking expectation
| 20 | +of most users is likely that the resultant numbers are uniform.
| 21 | + |
| 22 | +So we implement here an efficient way of generating uniform bounded |
| 23 | +random integers. Through use of compile-time evaluation, and avoiding |
| 24 | +divisions as much as possible, this commit introduces no measurable |
| 25 | +overhead. At least for hot-path uses tested, any potential difference |
| 26 | +was lost in the noise. On both clang and gcc, code generation is pretty |
| 27 | +small. |
| 28 | + |
| 29 | +The new function, get_random_u32_below(), lives in random.h, rather than |
| 30 | +prandom.h, and has a "get_random_xxx" function name, because it is |
| 31 | +suitable for all uses, including cryptography. |
| 32 | + |
| 33 | +In order to be efficient, we implement a kernel-specific variant of |
| 34 | +Daniel Lemire's algorithm from "Fast Random Integer Generation in an |
| 35 | +Interval", linked below. The kernel's variant takes advantage of |
| 36 | +constant folding to avoid divisions entirely in the vast majority of |
| 37 | +cases, works on both 32-bit and 64-bit architectures, and requests a |
| 38 | +minimal amount of bytes from the RNG. |
| 39 | + |
| 40 | +Link: https://arxiv.org/pdf/1805.10941.pdf |
| 41 | +Cc: stable@vger.kernel.org # to ease future backports that use this api |
| 42 | +Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com> |
| 43 | +--- |
| 44 | + drivers/char/random.c | 22 ++++++++++++++++++++++ |
| 45 | + include/linux/prandom.h | 18 ++---------------- |
| 46 | + include/linux/random.h | 40 ++++++++++++++++++++++++++++++++++++++++ |
| 47 | + 3 files changed, 64 insertions(+), 16 deletions(-) |
| 48 | + |
| 49 | +--- a/drivers/char/random.c |
| 50 | ++++ b/drivers/char/random.c |
| 51 | +@@ -199,6 +199,7 @@ static void __cold process_random_ready_ |
| 52 | + * |
| 53 | + * void get_random_bytes(void *buf, size_t len) |
| 54 | + * u32 get_random_u32() |
| 55 | ++ * u32 get_random_u32_below(u32 ceil) |
| 56 | + * u64 get_random_u64() |
| 57 | + * unsigned int get_random_int() |
| 58 | + * unsigned long get_random_long() |
| 59 | +@@ -553,6 +554,27 @@ EXPORT_SYMBOL(get_random_ ##type); |
| 60 | + DEFINE_BATCHED_ENTROPY(u64) |
| 61 | + DEFINE_BATCHED_ENTROPY(u32) |
| 62 | + |
| 63 | ++u32 __get_random_u32_below(u32 ceil) |
| 64 | ++{ |
| 65 | ++ /* |
| 66 | ++ * This is the slow path for variable ceil. It is still fast, most of |
| 67 | ++ * the time, by doing traditional reciprocal multiplication and |
| 68 | ++ * opportunistically comparing the lower half to ceil itself, before |
| 69 | ++ * falling back to computing a larger bound, and then rejecting samples |
| 70 | ++ * whose lower half would indicate a range indivisible by ceil. The use |
| 71 | ++ * of `-ceil % ceil` is analogous to `2^32 % ceil`, but is computable |
| 72 | ++ * in 32 bits.
| 73 | ++ */ |
| 74 | ++ u64 mult = (u64)ceil * get_random_u32(); |
| 75 | ++ if (unlikely((u32)mult < ceil)) { |
| 76 | ++ u32 bound = -ceil % ceil; |
| 77 | ++ while (unlikely((u32)mult < bound)) |
| 78 | ++ mult = (u64)ceil * get_random_u32(); |
| 79 | ++ } |
| 80 | ++ return mult >> 32; |
| 81 | ++} |
| 82 | ++EXPORT_SYMBOL(__get_random_u32_below); |
| 83 | ++ |
| 84 | + #ifdef CONFIG_SMP |
| 85 | + /* |
| 86 | + * This function is called when the CPU is coming up, with entry |
| 87 | +--- a/include/linux/random.h |
| 88 | ++++ b/include/linux/random.h |
| 89 | +@@ -45,6 +45,46 @@ static inline unsigned long get_random_l |
| 90 | + #endif |
| 91 | + } |
| 92 | + |
| 93 | ++u32 __get_random_u32_below(u32 ceil); |
| 94 | ++ |
| 95 | ++/* |
| 96 | ++ * Returns a random integer in the interval [0, ceil), with uniform |
| 97 | ++ * distribution, suitable for all uses. Fastest when ceil is a constant, but |
| 98 | ++ * still fast for variable ceil as well. |
| 99 | ++ */ |
| 100 | ++static inline u32 get_random_u32_below(u32 ceil) |
| 101 | ++{ |
| 102 | ++ if (!__builtin_constant_p(ceil)) |
| 103 | ++ return __get_random_u32_below(ceil); |
| 104 | ++ |
| 105 | ++ /* |
| 106 | ++ * For the fast path, below, all operations on ceil are precomputed by |
| 107 | ++ * the compiler, so this incurs no overhead for checking pow2, doing |
| 108 | ++ * divisions, or branching based on integer size. The resultant |
| 109 | ++ * algorithm does traditional reciprocal multiplication (typically |
| 110 | ++ * optimized by the compiler into shifts and adds), rejecting samples |
| 111 | ++ * whose lower half would indicate a range indivisible by ceil. |
| 112 | ++ */ |
| 113 | ++ BUILD_BUG_ON_MSG(!ceil, "get_random_u32_below() must take ceil > 0"); |
| 114 | ++ if (ceil <= 1) |
| 115 | ++ return 0; |
| 116 | ++ for (;;) { |
| 117 | ++ if (ceil <= 1U << 8) { |
| 118 | ++ u32 mult = ceil * (get_random_u32() & 0xff); |
| 119 | ++ if (likely(is_power_of_2(ceil) || (u8)mult >= (1U << 8) % ceil)) |
| 120 | ++ return mult >> 8; |
| 121 | ++ } else if (ceil <= 1U << 16) { |
| 122 | ++ u32 mult = ceil * (get_random_u32() & 0xffff); |
| 123 | ++ if (likely(is_power_of_2(ceil) || (u16)mult >= (1U << 16) % ceil)) |
| 124 | ++ return mult >> 16; |
| 125 | ++ } else { |
| 126 | ++ u64 mult = (u64)ceil * get_random_u32(); |
| 127 | ++ if (likely(is_power_of_2(ceil) || (u32)mult >= -ceil % ceil)) |
| 128 | ++ return mult >> 32; |
| 129 | ++ } |
| 130 | ++ } |
| 131 | ++} |
| 132 | ++ |
| 133 | + /* |
| 134 | + * On 64-bit architectures, protect against non-terminated C string overflows |
| 135 | + * by zeroing out the first byte of the canary; this leaves 56 bits of entropy. |
0 commit comments