| /* Copyright lowRISC contributors. */ |
| /* Licensed under the Apache License, Version 2.0, see LICENSE for details. */ |
| /* SPDX-License-Identifier: Apache-2.0 */ |
| |
| /** |
| * This library contains arithmetic for the finite field modulo the prime |
| * p = 2^255-19, which is used for both the X25519 key exchange and the Ed25519 |
| * signature scheme. |
| */ |
| |
| |
| /** |
| * Fully reduce a 510-bit number modulo p. |
| * |
| * Returns c = a mod p. |
| * |
| * Uses Solinas/pseudo-Mersenne reduction, which is based on the observation |
| * that if a large number x is split into two so that the lower 255 bits are x0 |
| * and all bits above 255 are x1, then: |
| * |
| * x = x0 + (2^255 * x1) \equiv (x0 + (19 * x1)) (mod p) |
| * |
| * If x is large, then x0 + 19 * x1 will be much smaller than x, because 19 << |
| * 2^255. This step can be repeated as necessary until a conditional |
| * subtraction is enough to fully reduce. |
| * |
| * Note about register notations: in this code, [a:b] indicates that the |
| * registers are adjacent and their contents can be logically concatenated to |
| * get a single larger value. Otherwise, the notation is [a,b]. |
| * |
| * This routine runs in constant time. |
| * |
| * Flags: Flags have no meaning beyond the scope of this subroutine. |
| * |
| * @param[in] w19: constant, w19 = 19 |
| * @param[in] [w21:w20]: a, number to be reduced (a < 2^510) |
| * @param[in] MOD: p, modulus = 2^255 - 19 |
| * @param[in] w31: all-zero |
| * @param[out] w22: c, result = a mod p |
| * |
| * clobbered registers: w18, w20 to w22 |
| * clobbered flag groups: FG0 |
| */ |
| fe_reduce: |
| /* First Solinas step (reducing modulo 2p = 2^256-38). */ |
| |
| /* Multiply the high bits by 38 (fits in 64bx256b multiply). Note that the |
| last multiplication result is zero; it exists only to write back the |
| accumulator from previous multiplies. |
| w18,w22 <= 19 * (2 * w21) = 38 * a[509:256] */ |
| bn.add w18, w21, w21 |
| bn.mulqacc.z w19.0, w18.0, 0 |
| bn.mulqacc.so w22.L, w19.0, w18.1, 64 |
| bn.mulqacc w19.0, w18.2, 0 |
| bn.mulqacc.so w22.U, w19.0, w18.3, 64 |
| bn.mulqacc.wo w18, w19.0, w31.0, 0 |
| |
| /* Add to low bits. |
| [w21:w20] <= w20 + [w18,w22] = a[255:0] + (38 * a[509:256]) = r1 */ |
| bn.add w20, w20, w22 |
| bn.addc w21, w31, w18 |
| |
| /* First Solinas step is complete. At this point, the new intermediate result |
| r1 is at most 261 bits, because: |
| a[509:256] = 254 bits |
| 38 * a[509:256] = 260 bits |
| a[255:0] + 38 * a[509:256] = 261 bits */ |
| |
| /* Begin second Solinas step (reducing by p = 2^255 - 19 this time). */ |
| |
| /* Extract the high 6 bits. |
| w21 <= [w21:w20] >> 255 = r1[260:255] */ |
| bn.rshi w21, w21, w20 >> 255 |
| |
| /* Extract the low 255 bits. |
| w20 <= r1[254:0] */ |
| bn.rshi w20, w20, w31 >> 255 |
| bn.rshi w20, w31, w20 >> 1 |
| |
| /* Multiply the high bits by 19 (fits in 64bx64b multiply). |
| w21 <= w19 * w21 = 19 * r1[260:255] */ |
| bn.mulqacc.wo.z w21, w19.0, w21.0, 0 |
| |
| /* Add to low bits (guaranteed by bounds not to carry). |
| w20 <= r1[254:0] + (19 * r1[260:255]) = r2 */ |
| bn.add w20, w20, w21 |
| |
| /* Second Solinas step is complete. At this point, we know r2 < 2 * p, |
| because of bounds implied by bit lengths: |
| r1[254:0] + 19 * r1[260:255] <= 2^255 - 1 + 19 * (2^6 - 1) < 2 * p */ |
| |
| /* Conditionally subtract modulus if p <= r2. |
| w22 <= r2 mod p = a mod p */ |
| bn.addm w22, w20, w31 |
| |
| ret |
| |
| /** |
| * Multiply two field elements and reduce modulo p. |
| * |
| * Returns c = (a * b) mod p. |
| * |
| * The inputs a and b must be at most 2^255 bits, although it is not necessary |
| * for them to be fully reduced modulo p. This routine runs in constant time. |
| * |
| * Flags: Flags have no meaning beyond the scope of this subroutine. |
| * |
| * @param[in] w19: constant, w19 = 19 |
| * @param[in] w22: a, first operand, a < 2^255 |
| * @param[in] w23: b, second operand, b < 2^255 |
| * @param[in] MOD: p, modulus = 2^255 - 19 |
| * @param[in] w31: all-zero |
| * @param[out] w22: c, result |
| * |
| * clobbered registers: w18, w20 to w22 |
| * clobbered flag groups: FG0 |
| */ |
| .globl fe_mul |
| fe_mul: |
| /* Compute the raw 510-bit product. |
| [w21:w20] <= a * b */ |
| bn.mulqacc.z w22.0, w23.0, 0 |
| bn.mulqacc w22.1, w23.0, 64 |
| bn.mulqacc.so w20.L, w22.0, w23.1, 64 |
| bn.mulqacc w22.2, w23.0, 0 |
| bn.mulqacc w22.1, w23.1, 0 |
| bn.mulqacc w22.0, w23.2, 0 |
| bn.mulqacc w22.3, w23.0, 64 |
| bn.mulqacc w22.2, w23.1, 64 |
| bn.mulqacc w22.1, w23.2, 64 |
| bn.mulqacc.so w20.U, w22.0, w23.3, 64 |
| bn.mulqacc w22.3, w23.1, 0 |
| bn.mulqacc w22.2, w23.2, 0 |
| bn.mulqacc w22.1, w23.3, 0 |
| bn.mulqacc w22.3, w23.2, 64 |
| bn.mulqacc.so w21.L, w22.2, w23.3, 64 |
| bn.mulqacc.so w21.U, w22.3, w23.3, 0 |
| |
| /* Reduce modulo p. |
| w22 <= [w21:w20] mod p = (a * b) mod p */ |
| jal x1, fe_reduce |
| |
| ret |
| |
| /** |
| * Square a field element and reduce modulo p. |
| * |
| * Returns c = (a * a) mod p. |
| * |
| * The input a must be at most 2^255 bits, although it is not necessary for it |
| * to be fully reduced modulo p. This specialized squaring procedure is |
| * slightly faster than fe_mul, because duplicated partial products can be |
| * multiplied by two instead of being computed separately. By optimizing for |
| * this special case, we can use 10 multiplications and 4 additions instead of |
| * 16 multiplications to compute the raw product. |
| * |
| * Note that this is only 2 instructions faster than fe_mul; if we need to cut |
| * down on code size, we could try not using a specialized square. However, |
| * this routine is called many, many times (especially in the inverse |
| * computation) so those two instructions might add up to quite a bit in the |
| * end. |
| * |
| * This routine runs in constant time. |
| * |
| * Flags: Flags have no meaning beyond the scope of this subroutine. |
| * |
| * @param[in] w19: constant, w19 = 19 |
| * @param[in] w22: a, operand, a < 2^255 |
| * @param[in] MOD: p, modulus = 2^255 - 19 |
| * @param[in] w31: all-zero |
| * @param[out] w22: c, result |
| * |
| * clobbered registers: w17, w18, w20 to w22 |
| * clobbered flag groups: FG0 |
| */ |
| .globl fe_square |
| fe_square: |
| /* Compute the partial products that do NOT need to be multiplied by 2. |
| [w21:w20] <= a0^2 + (a1^2 << 128) + (a2^2 << 256) + (a3^2 << 384) */ |
| bn.mulqacc.so.z w20.L, w22.0, w22.0, 0 |
| bn.mulqacc.so w20.U, w22.1, w22.1, 0 |
| bn.mulqacc.so w21.L, w22.2, w22.2, 0 |
| bn.mulqacc.so w21.U, w22.3, w22.3, 0 |
| |
| /* Compute the partial products that do need to be multiplied by 2. |
| [w18:w17] <= (a0a1 << 64) + (a0a2 << 128) + (a0a3 << 192) |
| + (a1a2 << 192) + (a1a3 << 256) |
| + (a2a3 << 320) */ |
| bn.mulqacc.so.z w17.L, w22.0, w22.1, 64 |
| bn.mulqacc w22.0, w22.2, 0 |
| bn.mulqacc w22.0, w22.3, 64 |
| bn.mulqacc.so w17.U, w22.1, w22.2, 64 |
| bn.mulqacc w22.1, w22.3, 0 |
| bn.mulqacc.wo w18, w22.2, w22.3, 64 |
| |
| /* Add the partial products. |
| [w21:w20] <= [w21:w20] + [w18:w17] * 2 = a * a */ |
| bn.add w20, w20, w17 |
| bn.addc w21, w21, w18 |
| bn.add w20, w20, w17 |
| bn.addc w21, w21, w18 |
| |
| /* Reduce modulo p. |
| w22 <= [w21:w20] mod p = (a * a) mod p */ |
| jal x1, fe_reduce |
| |
| ret |
| |
| /** |
| * Compute the inverse of an element in the finite field modulo (2^255-19). |
| * |
| * Returns c = (a^(-1)) mod p. |
| * |
| * Uses Fermat's Little Theorem, which states that for any nonzero element a of |
| * the finite field modulo a prime p, then a^(p-1) mod p = 1. A corrolary of |
| * this theorem is that (a * (a^(p-2))) mod p = 1, so a^(p-2) is a |
| * multiplicative inverse of a modulo p. |
| * |
| * To compute a^(p-2), we use a square-and-multiply algorithm. This chain of |
| * squares and multiplies is modified from curve25519-donna |
| * (https://github.com/agl/curve25519-donna/blob/f7837adf95a2c2dcc36233cb02a1fb34081c0c4a/curve25519-donna-c64.c#L403), |
| * which is in turn a modified version of the (qhasm) reference implementation |
| * published with the original paper. |
| * |
| * The main difference between this implementation and donna is that we attempt |
| * to minimize bn.mov instructions by making sure multiplies/squares always use |
| * the same temporary variables/registers. |
| * |
| * This routine runs in constant time. |
| * |
| * Flags: Flags have no meaning beyond the scope of this subroutine. |
| * |
| * @param[in] w19: constant, w19 = 19 |
| * @param[in] w16: a, first operand, a < p |
| * @param[in] MOD: p, modulus = 2^255 - 19 |
| * @param[in] w31: all-zero |
| * @param[out] w22: c, result |
| * |
| * clobbered registers: w14, w15, w17, w18, w20 to w23 |
| * clobbered flag groups: FG0 |
| */ |
| .globl fe_inv |
| fe_inv: |
| /* w22 <= w16^2 = a^2 */ |
| bn.mov w22, w16 |
| jal x1, fe_square |
| /* w23 <= w22 = a^2 */ |
| bn.mov w23, w22 |
| |
| /* w22 <= w22^4 = a^8 */ |
| jal x1, fe_square |
| jal x1, fe_square |
| |
| /* w22 <= w22 * w23 = a^10 */ |
| jal x1, fe_mul |
| /* w15 <= w22 = a^10 */ |
| bn.mov w15, w22 |
| |
| /* w22 <= w22 * w16 = a^11 */ |
| bn.mov w23, w16 |
| jal x1, fe_mul |
| /* w14 <= w22 <= a^11 */ |
| bn.mov w14, w22 |
| |
| /* w22 <= w15^2 = a^20 */ |
| bn.mov w22, w15 |
| jal x1, fe_square |
| |
| /* w22 <= w22 * w14 = a^31 = a^(2^5 - 1) */ |
| bn.mov w23, w14 |
| jal x1, fe_mul |
| /* w23 <= w22 = a^(2^5 - 1) */ |
| bn.mov w23, w22 |
| |
| /* w22 <= w22^(2^5) = a^(2^10-2^5) */ |
| loopi 5,2 |
| jal x1, fe_square |
| nop |
| |
| /* w22 <= w22 * w23 = a^(2^10-1) */ |
| jal x1, fe_mul |
| /* w23 <= w22 <= a^(2^10-1) */ |
| bn.mov w23, w22 |
| /* w15 <= w22 <= a^(2^10-1) */ |
| bn.mov w15, w22 |
| |
| /* w22 <= w22^(2^10) = a^(2^20-2^10) */ |
| loopi 10,2 |
| jal x1, fe_square |
| nop |
| |
| /* w22 <= w22 * w23 = a^(2^20-1) */ |
| jal x1, fe_mul |
| /* w23 <= w22 <= a^(2^20-1) */ |
| bn.mov w23, w22 |
| |
| /* w22 <= w22^(2^20) = a^(2^40-2^20) */ |
| loopi 20,2 |
| jal x1, fe_square |
| nop |
| |
| /* w22 <= w22 * w23 = a^(2^40-1) */ |
| jal x1, fe_mul |
| |
| /* w22 <= w22^(2^10) = a^(2^50-2^10) */ |
| loopi 10,2 |
| jal x1, fe_square |
| nop |
| |
| /* w22 <= w22 * w15 = a^(2^50-1) */ |
| bn.mov w23, w15 |
| jal x1, fe_mul |
| /* w23 <= w22 <= a^(2^50-1) */ |
| bn.mov w23, w22 |
| /* w15 <= w22 <= a^(2^50-1) */ |
| bn.mov w15, w22 |
| |
| /* w22 <= w22^(2^50) = a^(2^100-2^50) */ |
| loopi 50,2 |
| jal x1, fe_square |
| nop |
| |
| /* w22 <= w22 * w23 = a^(2^100-1) */ |
| jal x1, fe_mul |
| /* w23 <= w22 <= a^(2^100-1) */ |
| bn.mov w23, w22 |
| |
| /* w22 <= w22^(2^100) = a^(2^200-2^100) */ |
| loopi 100,2 |
| jal x1, fe_square |
| nop |
| |
| /* w22 <= w22 * w23 = a^(2^200-1) */ |
| jal x1, fe_mul |
| |
| /* w22 <= w22^(2^50) = a^(2^250-2^50) */ |
| loopi 50,2 |
| jal x1, fe_square |
| nop |
| |
| /* w22 <= w22 * w15 = a^(2^250-1) */ |
| bn.mov w23, w15 |
| jal x1, fe_mul |
| |
| /* w22 <= w22^(2^5) = a^(2^255-2^5) */ |
| loopi 5,2 |
| jal x1, fe_square |
| nop |
| |
| /* w22 <= w22 * w14 = a^(2^255 - 2^5 + 11) = a^(2^255 - 21) = a^(p-2) */ |
| bn.mov w23, w14 |
| jal x1, fe_mul |
| |
| ret |