| // Copyright lowRISC contributors. |
| // Licensed under the Apache License, Version 2.0, see LICENSE for details. |
| // SPDX-License-Identifier: Apache-2.0 |
| |
| #include "sw/device/lib/crypto/aes_gcm/aes_gcm.h" |
| |
| #include <stddef.h> |
| #include <stdint.h> |
| |
| #include "sw/device/lib/base/macros.h" |
| #include "sw/device/lib/base/memory.h" |
| #include "sw/device/lib/crypto/drivers/aes.h" |
| |
| enum { |
| /* Log2 of the number of bytes in an AES block. */ |
| kAesBlockLog2NumBytes = 4, |
| }; |
| OT_ASSERT_ENUM_VALUE(kAesBlockNumBytes, 1 << kAesBlockLog2NumBytes); |
| |
| /** |
| * Precomputed modular reduction constants for Galois field multiplication. |
| * |
| * This implementation uses 4-bit windows. The bytes here represent 12-bit |
| * little-endian values. |
| * |
| * The entry with index i in this table is equal to i * 0xe1, where the bytes i |
| * and 0xe1 are interpreted as polynomials in the GCM Galois field. For |
| * example, 0xe1 = 0b1110_0001 is the degree-8 polynomial x^8 + x^2 + x + 1. |
| * |
| * The field modulus for GCM is 2^128 + x^8 + x^2 + x + 1. Therefore, if a |
| * field element is shifted, it can be reduced by multiplying the high bits |
| * (128 and above) by 0xe1 and adding them to the low bits. |
| * |
| * There is a size/speed tradeoff in window size. For 8-bit windows, GHASH |
| * becomes significantly faster, but the overhead for computing the product |
| * table of a given hash subkey becomes higher, so larger windows are slower |
| * for smaller inputs but faster for large inputs. |
| */ |
| static const uint16_t kGFReduceTable[16] = { |
| 0x0000, 0x201c, 0x4038, 0x6024, 0x8070, 0xa06c, 0xc048, 0xe054, |
| 0x00e1, 0x20fd, 0x40d9, 0x60c5, 0x8091, 0xa08d, 0xc0a9, 0xe0b5}; |
| |
| /** |
| * Product table for a given hash subkey. |
| * |
| * See `make_product_table`. |
| */ |
| typedef struct aes_gcm_product_table { |
| aes_block_t products[16]; |
| } aes_gcm_product_table_t; |
| |
| /** |
| * Performs a bitwise XOR of two blocks. |
| * |
| * This operation corresponds to addition in the Galois field. |
| * |
| * @param x First operand block |
| * @param y Second operand block |
| * @param out Buffer in which to store output; can be the same as one or both |
| * operands. |
| */ |
| static inline void block_xor(const aes_block_t *x, const aes_block_t *y, |
| aes_block_t *out) { |
| for (size_t i = 0; i < kAesBlockNumWords; ++i) { |
| out->data[i] = x->data[i] ^ y->data[i]; |
| } |
| } |
| |
| /** |
| * Reverse the bits of a 4-bit number. |
| * |
| * @param byte Input byte. |
| * @return byte with lower 4 bits reversed and upper 4 unmodified. |
| */ |
| static uint8_t reverse_bits(uint8_t byte) { |
| /* TODO: replace with rev.n (from 0.93 draft of bitmanip) once bitmanip |
| * extension is enabled. */ |
| uint8_t out = 0; |
| for (size_t i = 0; i < 4; ++i) { |
| out <<= 1; |
| out |= (byte >> i) & 1; |
| } |
| return out; |
| } |
| |
| /** |
| * Reverse the bytes of a 32-bit word. |
| * |
| * This function is convenient for converting between the processor's native |
| * integers, which are little-endian, and NIST's integer representation, which |
| * is big-endian. |
| * |
| * @param word Input word. |
| * @return Word with bytes reversed. |
| */ |
| static uint32_t reverse_bytes(uint32_t word) { |
| /* TODO: replace with rev8 once bitmanip extension is enabled. */ |
| uint32_t out = 0; |
| for (size_t i = 0; i < sizeof(uint32_t); ++i) { |
| out <<= 8; |
| out |= (word >> (i << 3)) & 255; |
| } |
| return out; |
| } |
| |
| /** |
| * Logical right shift of an AES block. |
| * |
| * NIST represents AES blocks as big-endian, and a "right shift" in that |
| * representation means shifting out the LSB. However, the processor is |
| * little-endian, so we need to reverse the bytes before shifting. |
| * |
| * @param block Input AES block, modified in-place |
| * @param nbits Number of bits to shift |
| */ |
| static inline void block_shiftr(aes_block_t *block, size_t nbits) { |
| // To perform a "right shift" with respect to the big-endian block |
| // representation, we traverse the words of the block and, for each word, |
| // reverse the bytes, save the new LSB as `carry`, shift right, set the MSB |
| // to the last block's carry, and then reverse the bytes back. |
| uint32_t carry = 0; |
| for (size_t i = 0; i < kAesBlockNumWords; ++i) { |
| uint32_t rev = reverse_bytes(block->data[i]); |
| uint32_t next_carry = rev & ((1 << nbits) - 1); |
| rev >>= nbits; |
| rev |= carry << (32 - nbits); |
| carry = next_carry; |
| block->data[i] = reverse_bytes(rev); |
| } |
| } |
| |
| /** |
| * Retrieve the byte at a given index from the AES block. |
| * |
| * @param block Input AES block |
| * @param index Index of byte to retrieve (must be < `kAesBlockNumBytes`) |
| * @return Value of block[index] |
| */ |
| static inline uint8_t block_byte_get(const aes_block_t *block, size_t index) { |
| return (uint8_t)((char *)block->data)[index]; |
| } |
| |
| /** |
| * Get the number of bytes past the last full block of the buffer. |
| * |
| * Equivalent to `sz % kAesBlockNumBytes`. |
| * |
| * @param sz Number of bytes to represent |
| * @return Offset of end of buffer from last full block |
| */ |
| static inline size_t get_block_offset(size_t sz) { |
| return sz & ((1 << kAesBlockLog2NumBytes) - 1); |
| } |
| |
| /** |
| * Get the minimum number of AES blocks needed to represent `sz` bytes. |
| * |
| * This routine does not run in constant time and should not be used for |
| * sensitive input values. |
| * |
| * @param sz Number of bytes to represent |
| * @return Minimum number of blocks needed |
| */ |
| static inline size_t get_nblocks(size_t sz) { |
| size_t out = sz >> kAesBlockLog2NumBytes; |
| if (get_block_offset(sz) != 0) { |
| out += 1; |
| } |
| return out; |
| } |
| |
| /** |
| * Increment a 32-bit big-endian word. |
| * |
| * This operation theoretically increments the "rightmost" (last) 32 bits of |
| * the input modulo 2^32. However, NIST treats all bitvectors as big-endian and |
| * the processor is little-endian. In order to increment we first need to |
| * reverse the bytes of the 32-bit word, then increment, then reverse it back. |
| * |
| * @param word Input word in big-endian form. |
| * @returns (word + 1) mod 2^32 in big-endian form. |
| */ |
| static inline uint32_t word_inc32(const uint32_t word) { |
| // Reverse the bytes, increment, and reverse back. |
| return reverse_bytes(reverse_bytes(word) + 1); |
| } |
| |
| /** |
| * Multiply an element of the GCM Galois field by the polynomial `x`. |
| * |
| * This corresponds to a shift right in the bit representation, and then |
| * reduction with the field modulus. |
| * |
| * Runs in constant time. |
| * |
| * @param p Polynomial to be multiplied |
| * @param out Buffer for output |
| */ |
| static inline void galois_mulx(const aes_block_t *p, aes_block_t *out) { |
| // Get the very rightmost bit of the input block (coefficient of x^127). |
| uint8_t p127 = block_byte_get(p, kAesBlockNumBytes - 1) & 1; |
| // Set the output to p >> 1. |
| memcpy(out->data, p->data, kAesBlockNumBytes); |
| block_shiftr(out, 1); |
| // If the highest coefficient was 1, then subtract the polynomial that |
| // corresponds to (modulus - 2^128). |
| uint32_t mask = 0 - p127; |
| out->data[0] ^= (0xe1 & mask); |
| } |
| |
| /** |
| * Construct a product table for the hash subkey H. |
| * |
| * The product table entry at index i is equal to i * H, where both i and H are |
| * interpreted as Galois field polynomials. |
| * |
| * @param H Hash subkey |
| * @param product_table Buffer for output |
| */ |
| static void make_product_table(const aes_block_t *H, |
| aes_gcm_product_table_t *tbl) { |
| // Initialize 0 * H = 0. |
| memset(tbl->products[0].data, 0, kAesBlockNumBytes); |
| // Initialize 1 * H = H. |
| memcpy(tbl->products[0x8].data, H->data, kAesBlockNumBytes); |
| |
| // To get remaining entries, we use a variant of "shift and add"; in |
| // polynomial terms, a shift is a multiplication by x. Note that, because the |
| // processor represents bytes with the MSB on the left and NIST uses a fully |
| // little-endian polynomial representation with the MSB on the right, we have |
| // to reverse the bits of the indices. |
| for (size_t i = 2; i < 16; i += 2) { |
| // Find the product corresponding to (i >> 1) * H and multiply by x to |
| // shift 1; this will be i * H. |
| galois_mulx(&tbl->products[reverse_bits(i >> 1)], |
| &tbl->products[reverse_bits(i)]); |
| |
| // Add H to i * H to get (i + 1) * H. |
| block_xor(&tbl->products[reverse_bits(i)], H, |
| &tbl->products[reverse_bits(i + 1)]); |
| } |
| } |
| |
| /** |
| * Computes the "product block" as defined in NIST SP800-38D, section 6.3. |
| * |
| * The NIST documentation shows a very slow double-and-add algorithm, which |
| * iterates through the first operand bit-by-bit. However, since the second |
| * operand is always the hash subkey H, we can speed things up significantly |
| * with a little precomputation. |
| * |
| * The `tbl` input should be a precomputed table with the products of H and |
| * 256 all possible byte values (see `make_product_table`). |
| * |
| * This operation corresponds to multiplication in the Galois field with order |
| * 2^128, modulo the polynomial x^128 + x^8 + x^2 + x + 1 |
| * |
| * @param p Polynomial to multiply |
| * @param tbl Precomputed product table for the hash subkey |
| * @param result Block in which to store output |
| */ |
| static void galois_mul(const aes_block_t *p, const aes_gcm_product_table_t *tbl, |
| aes_block_t *result) { |
| // Initialize the result to 0. |
| memset(result->data, 0, kAesBlockNumBytes); |
| |
| // 4-bit windows, so number of windows is 2x number of bytes. |
| const size_t kNumWindows = kAesBlockNumBytes << 1; |
| |
| // To compute the product, we iterate through the bytes of the input block, |
| // considering the most significant (in polynomial terms) first. For each |
| // byte b, we: |
| // * multiply `result` by x^8 (shift all coefficients to the right) |
| // * reduce the shifted `result` modulo the field modulus |
| // * look up the product `b * H` and add it to `result` |
| // |
| // We can skip the shift and reduce steps on the first iteration, since |
| // `result` is 0. |
| for (size_t i = 0; i < kNumWindows; ++i) { |
| if (i != 0) { |
| // Save the most significant half-byte of `result` before shifting. |
| uint8_t overflow = block_byte_get(result, kAesBlockNumBytes - 1) & 0x0f; |
| // Shift `result` to the right, discarding high bits. |
| block_shiftr(result, 4); |
| // Look up the product of `overflow` and the low terms of the modulus in |
| // the precomputed table. |
| uint16_t reduce_term = kGFReduceTable[overflow]; |
| // Add (xor) this product to the low bits to complete modular reduction. |
| // This works because (low + x^128 * high) is equivalent to (low + (x^128 |
| // - modulus) * high). |
| result->data[0] ^= reduce_term; |
| } |
| |
| // Add the product of the next window and H to `result`. We process the |
| // windows starting with the most significant polynomial terms, which means |
| // starting from the last byte and proceeding to the first. |
| uint8_t pi = block_byte_get(p, (kNumWindows - 1 - i) >> 1); |
| |
| // Select the less significant 4 bits if i is even, or the more significant |
| // 4 bits if i is odd. This does not need to be constant time, since the |
| // values of i in this loop are constant. |
| if ((i & 1) == 1) { |
| pi >>= 4; |
| } else { |
| pi &= 0x0f; |
| } |
| block_xor(result, &tbl->products[pi], result); |
| } |
| } |
| |
| /** |
| * Advances the GHASH state for the given input. |
| * |
| * This function is convenient for computing the GHASH of large, concatenated |
| * buffers. Given a state representing GHASH(A) and a new input B, this |
| * function will return a state representing GHASH(A || B || <padding for B>). |
| * |
| * Pads the input with 0s on the right-hand side if needed so that the input |
| * size is a multiple of the block size. |
| * |
| * The product table should match the format from `make_product_table`. |
| * |
| * @param tbl Precomputed product table |
| * @param input_len Length of input in bytes |
| * @param input Input buffer |
| * @param state GHASH state, updated in place |
| */ |
| static void aes_gcm_ghash_update(const aes_gcm_product_table_t *tbl, |
| const size_t input_len, const uint8_t *input, |
| aes_block_t *state) { |
| // Calculate the number of AES blocks needed for the input. |
| size_t nblocks = get_nblocks(input_len); |
| |
| // Main loop. |
| for (size_t i = 0; i < nblocks; ++i) { |
| // Construct block i of the input. |
| aes_block_t input_block; |
| if ((i == nblocks - 1) && (get_block_offset(input_len) != 0)) { |
| // Last block is not full; pad with zeroes. |
| size_t nbytes = get_block_offset(input_len); |
| memset(input_block.data, 0, kAesBlockNumBytes); |
| memcpy(input_block.data, input, nbytes); |
| } else { |
| // Full block; copy data and advance input pointer. |
| memcpy(input_block.data, input, kAesBlockNumBytes); |
| input += kAesBlockNumBytes; |
| } |
| // XOR `state` with the next input block. |
| block_xor(state, &input_block, state); |
| // Multiply state by H and update. |
| aes_block_t yH; |
| galois_mul(state, tbl, &yH); |
| memcpy(state->data, yH.data, kAesBlockNumBytes); |
| } |
| } |
| |
| aes_error_t aes_gcm_ghash(const aes_block_t *hash_subkey, |
| const size_t input_len, const uint8_t *input, |
| aes_block_t *output) { |
| // Compute the product table for H. |
| aes_gcm_product_table_t tbl; |
| make_product_table(hash_subkey, &tbl); |
| |
| // If the input length is not a multiple of the block size, fail. |
| if (get_block_offset(input_len) != 0) { |
| return kAesInternalError; |
| } |
| |
| // Initialize the GHASH state to 0. |
| memset(output->data, 0, kAesBlockNumBytes); |
| |
| // Update the GHASH state with the input. |
| aes_gcm_ghash_update(&tbl, input_len, input, output); |
| |
| return kAesOk; |
| } |
| |
| /** |
| * One-shot version of the AES API for a single block, for convenience. |
| */ |
| static aes_error_t aes_single_block(const aes_params_t params, |
| const aes_block_t *input, |
| aes_block_t *output) { |
| aes_error_t err = aes_begin(params); |
| if (err != kAesOk) { |
| return err; |
| } |
| err = aes_update(output, input); |
| if (err != kAesOk) { |
| return err; |
| } |
| err = aes_end(); |
| if (err != kAesOk) { |
| return err; |
| } |
| return kAesOk; |
| } |
| |
| /** |
| * Implements the GCTR function as specified in SP800-38D, section 6.5. |
| * |
| * The block cipher is fixed to AES. Note that the GCTR function is a modified |
| * version of the AES-CTR mode of encryption. |
| * |
| * Input must be less than 2^32 blocks long; that is, `len` < 2^36, since each |
| * block is 16 bytes. |
| * |
| * @param key_len length of the AES key |
| * @param key_shares key, expressed in two shares |
| * @param icb Initial counter block, 128 bits |
| * @param len Number of bytes for input and output |
| * @param input Pointer to input buffer (may be NULL if `len` is 0) |
| * @param output Pointer to output buffer (same size as input, may be the same |
| * buffer) |
| */ |
| aes_error_t aes_gcm_gctr(const aes_key_len_t key_len, uint32_t *key_shares[2], |
| const aes_block_t *icb, const size_t len, |
| const uint8_t *input, uint8_t *output) { |
| // If the input is empty, the output must be as well. Since the output length |
| // is 0, simply return. |
| if (len == 0) { |
| return kAesOk; |
| } |
| |
| // NULL pointers are not allowed for nonzero input length. |
| if (input == NULL || output == NULL) { |
| return kAesInternalError; |
| } |
| |
| // Prepare AES parameters for AES-CTR encryption. Encrypting a single block B |
| // with these parameters will return AES_K(IV) ^ B, where AES_K is the core |
| // AES block cipher encryption operation using key K. |
| aes_params_t aes_ctr_params = { |
| .encrypt = true, |
| .mode = kAesCipherModeCtr, |
| .key_len = key_len, |
| .key = {key_shares[0], key_shares[1]}, |
| .iv = {0}, |
| }; |
| memcpy(aes_ctr_params.iv, icb->data, kAesBlockNumBytes); |
| |
| // Calculate the number of blocks needed to represent the input. Since we |
| // checked for empty input above, nblocks must be at least 1. |
| size_t nblocks = get_nblocks(len); |
| for (size_t i = 0; i < nblocks; ++i) { |
| // Retrieve the next block of input. All blocks are full-size except for |
| // the last block, which may be partial. If the block is partial, the input |
| // data will be padded with zeroes. |
| aes_block_t block_in = {.data = {0}}; |
| size_t nbytes = kAesBlockNumBytes; |
| if ((i == nblocks - 1) && (get_block_offset(len) != 0)) { |
| // Last block is partial; copy over only the bytes that exist. |
| nbytes = get_block_offset(len); |
| memcpy(block_in.data, &input[i * kAesBlockNumBytes], nbytes); |
| } else { |
| // This block is a full block. |
| memcpy(block_in.data, &input[i * kAesBlockNumBytes], kAesBlockNumBytes); |
| } |
| |
| // Allocate a buffer for the cipher output. |
| aes_block_t block_out = {.data = {0}}; |
| |
| // Run the AES-CTR encryption operation on the next block of input. |
| aes_error_t err = aes_single_block(aes_ctr_params, &block_in, &block_out); |
| if (err != kAesOk) { |
| return err; |
| } |
| |
| // Copy the result block into the output buffer, truncating some bytes if |
| // the input block was partial. |
| memcpy(&output[i * kAesBlockNumBytes], block_out.data, nbytes); |
| |
| // Update the counter block. |
| aes_ctr_params.iv[kAesBlockNumWords - 1] = |
| word_inc32(aes_ctr_params.iv[kAesBlockNumWords - 1]); |
| } |
| |
| return kAesOk; |
| } |