| /* Copyright lowRISC contributors. */ |
| /* Licensed under the Apache License, Version 2.0, see LICENSE for details. */ |
| /* SPDX-License-Identifier: Apache-2.0 */ |
| |
| /** |
| * This library contains code for X25519, for use in elliptic-curve Diffie |
| * Hellman key exchange. |
| * |
| * Helpful references: |
| * RFC: https://datatracker.ietf.org/doc/html/rfc7748 |
| * Academic paper: https://www.iacr.org/cryptodb/archive/2006/PKC/3351/3351.pdf |
| */ |
| |
| /** |
| * Top-level X25519 function, as defined in RFC 7748. |
| * |
| * This routine runs in constant time. |
| * |
| * Flags: Flags have no meaning beyond the scope of this subroutine. |
| * |
| * @param[in] w8: enc(k), encoded scalar (256-bit random number) |
| * @param[in] w9: enc(u), encoded Montgomery u-coordinate (256 bits) |
| * @param[out] w22: result, X25519(k, u) as an encoded u-coordinate |
| * |
| * clobbered registers: w2 to w24 |
| * clobbered flag groups: FG0 |
| */ |
| .globl X25519 |
| X25519: |
| /* Prepare all-zero register. */ |
| bn.xor w31, w31, w31 |
| /* Prepare constant for field operations. |
| w19 <= 19 */ |
| bn.addi w19, w31, 19 |
| |
| /* Load modulus from DMEM. |
| MOD <= dmem[modulus25519] = 2^255 - 19 = p */ |
| li x2, 2 |
| la x3, modulus25519 |
| bn.lid x2, 0(x3) |
| bn.wsrw 0x0, w2 |
| |
| /* Decode scalar. From RFC 7748, section 5: |
| |
| "For X25519, in order to decode 32 random bytes as an integer scalar, set |
| the three least significant bits of the first byte and the most significant bit |
| of the last to zero, set the second most significant bit of the last byte to 1 |
| and, finally, decode as little-endian." */ |
| |
| /* w8 <= (w8 >> 3) = enc(k) >> 3 */ |
| bn.rshi w8, w31, w8 >> 3 |
| /* w8 <= w8 << 5 = ((enc(k) >> 3) << 5) mod 2^256 */ |
| bn.rshi w8, w8, w31 >> 251 |
| /* w8 <= 2^254 + (w8 >> 2) = ((enc(k) >> 3) << 3) mod 2^254 + 2^254 = k*/ |
| bn.addi w7, w31, 1 |
| bn.rshi w8, w7, w8 >> 2 |
| |
| /* Decode point u-coordinate. As specified in RFC 7748, section 5, we must |
| zero the most significant bit and accept non-canonical values (i.e. cases |
| where p <= u) as if they had been reduced modulo p. */ |
| |
| /* w7 <= ~w31 = 2^256 - 1 */ |
| bn.not w7, w31 |
| /* w7 <= w7 >> 1 = 2^255 - 1 */ |
| bn.rshi w7, w31, w7 >> 1 |
| /* w9 <= w9 & w7 = enc(u) mod 2^255 */ |
| bn.and w9, w9, w7 |
| /* w9 <= w9 mod p <= (enc(u) mod 2^255) mod p = u */ |
| bn.addm w9, w9, w31 |
| |
| /* Note that, in contrast to Ed25519, the given point does not have to be |
| validated; see the original Curve25519 paper for discussion (search |
| for "small-subgroup attacks" and "free key validation"). |
| https://www.iacr.org/cryptodb/archive/2006/PKC/3351/3351.pdf */ |
| |
| /* Perform scalar multiplication. |
| w22 <= scalar_mult(k, u) = X25519(k, u) */ |
| jal x1, scalar_mult |
| |
| /* Since the scalar multiplication routine has already reduced its result |
| modulo p and OTBN is little-endian, encoding the result is a no-op. */ |
| |
| ret |
| |
| /** |
| * Elliptic-curve scalar multiplication for Curve25519. |
| * |
| * Returns B = kA. |
| * |
| * Multiplies a point on the curve (A) with a scalar (k). Uses the Montgomery |
| * ladder algorithm as in RFC 7748. For the Montgomery ladder, we only need to |
| * consider the u-coordinate of curve points. |
| * |
| * See RFC 7748, section 5: |
| * https://datatracker.ietf.org/doc/html/rfc7748#section-5 |
| * |
| * This routine runs in constant time. |
| * |
| * Flags: Flags have no meaning beyond the scope of this subroutine. |
| * |
| * @param[in] w8: k, 255-bit scalar value |
| * @param[in] w9: u, Montgomery u-coordinate of point A (k < p) |
| * @param[in] w19: constant, w19 = 19 |
| * @param[in] w31, all-zero |
| * @param[in] MOD: p, modulus = 2^255 - 19 |
| * @param[out] w22: result, Montgomery u-coordinate of point kA |
| * |
| * clobbered registers: w2 to w7, w10 to w18, w20 to w24 |
| * clobbered flag groups: FG0 |
| */ |
| scalar_mult: |
| |
| /* Load the constant a24: |
| w24 <= dmem[a24] = 121665 */ |
| li x2, 24 |
| la x3, a24 |
| bn.lid x2, 0(x3) |
| |
| /* Set initial values. From RFC 7748: |
| |
| x_2 = 1 |
| z_2 = 0 |
| x_3 = u |
| z_3 = 1 |
| swap = 0 |
| */ |
| |
| /* x_2 = w10 <= 1 */ |
| bn.addi w10, w31, 1 |
| /* z_2 = w12 <= 0 */ |
| bn.mov w12, w31 |
| /* x_3 = w11 <= u */ |
| bn.mov w11, w9 |
| /* z_3 = w13 <= 1 */ |
| bn.addi w13, w31, 1 |
| /* swap = w15 <= 0 */ |
| bn.mov w15, w31 |
| |
| /* Initialize w4 for loop. |
| w14 <= (k << 1) mod 2^256 */ |
| bn.rshi w14, w8, w31 >> 255 |
| |
| /* Main loop. From RFC 7748 (with `ladderstep` factored into a separate step): |
| |
| For t = bits-1 down to 0: |
| k_t = (k >> t) & 1 |
| swap ^= k_t |
| (x_2, x_3) = cswap(swap, x_2, x_3) |
| (z_2, z_3) = cswap(swap, z_2, z_3) |
| swap = k_t |
| |
| ladderstep(u, x_2, x_3, z_2, z_3) |
| |
| Loop invariants: |
| w14 = (k << (255-t)) mod 2^256 |
| w9 = u |
| |
| Loop temporary registers: w6, w7 |
| */ |
| loopi 255, 12 |
| /* w7 <= w14 >> 255 = (k << (255-t)) mod 2^256 >> 255 = k[t] = k_t */ |
| bn.rshi w7, w31, w14 >> 255 |
| /* w14 <= w14 << 1 = (k << (255-(t-1)) mod 2^256 */ |
| bn.rshi w14, w14, w31 >> 255 |
| /* swap = w15 = w15 ^ w7 = swap ^ k_t */ |
| bn.xor w15, w15, w7 |
| |
| /* Note: The xor above set FG0.Z if and only if swap ^ k_t is zero. */ |
| |
| /* Conditionally swap x_2 and x_3. */ |
| |
| /* w6 <= if swap =? 0 then x_2 else x_3 */ |
| bn.sel w6, w10, w11, FG0.Z |
| /* x_3 = w11 <= if swap =? 0 then x_3 else x_2 */ |
| bn.sel w11, w11, w10, FG0.Z |
| /* x_2 = w10 <= w6 = if swap =? 0 then x_2 else x_3 */ |
| bn.mov w10, w6 |
| |
| /* Conditionally swap z_2 and z_3. */ |
| |
| /* w6 <= if swap =? 0 then z_2 else z_3 */ |
| bn.sel w6, w12, w13, FG0.Z |
| /* z_3 = w13 <= if swap =? 0 then z_3 else z_2 */ |
| bn.sel w13, w13, w12, FG0.Z |
| /* z_2 = w12 <= w6 = if swap =? 0 then z_2 else z_3 */ |
| bn.mov w12, w6 |
| |
| /* swap = w15 <= w7 = k_t */ |
| bn.mov w15, w7 |
| |
| jal x1, ladderstep |
| nop |
| |
| /* Loop done; final steps. From RFC 7748: |
| |
| (x_2, x_3) = cswap(swap, x_2, x_3) |
| (z_2, z_3) = cswap(swap, z_2, z_3) |
| Return x_2 * (z_2^(p - 2)) |
| */ |
| |
| /* Set the FG0.Z flag if swap (w15) is 0. */ |
| bn.add w15, w15, w31 |
| |
| /* Conditionally swap x_2 and x_3. */ |
| |
| /* w6 <= if swap =? 0 then x_2 else x_3 */ |
| bn.sel w6, w10, w11, FG0.Z |
| /* x_3 = w11 <= if swap =? 0 then x_3 else x_2 */ |
| bn.sel w11, w11, w10, FG0.Z |
| /* x_2 = w10 <= w6 = if swap =? 0 then x_2 else x_3 */ |
| bn.mov w10, w6 |
| |
| /* Conditionally swap z_2 and z_3. */ |
| |
| /* w6 <= if swap =? 0 then z_2 else z_3 */ |
| bn.sel w6, w12, w13, FG0.Z |
| /* z_3 = w13 <= if swap =? 0 then z_3 else z_2 */ |
| bn.sel w13, w13, w12, FG0.Z |
| /* z_2 = w12 <= w6 = if swap =? 0 then z_2 else z_3 */ |
| bn.mov w12, w6 |
| |
| /* w22 <= fe_inv(w12) = z_2^(p-2) */ |
| bn.mov w16, w12 |
| jal x1, fe_inv |
| /* w22 <= w22 * w10 = (z_2^(p-2)) * x_2 */ |
| bn.mov w23, w10 |
| jal x1, fe_mul |
| |
| ret |
| |
| /** |
| * Performs the core arithmetic step of the Montgomery ladder. |
| * |
| * This subroutine is intended to be used as part of a Montgomery ladder |
| * implementation. It performs the core arithmetic in the main loop, i.e. the |
| * following steps (from RFC 7748, section 5): |
| * A = x_2 + z_2 |
| * AA = A^2 |
| * B = x_2 - z_2 |
| * BB = B^2 |
| * E = AA - BB |
| * C = x_3 + z_3 |
| * D = x_3 - z_3 |
| * DA = D * A |
| * CB = C * B |
| * x_3 = (DA + CB)^2 |
| * z_3 = x_1 * (DA - CB)^2 |
| * x_2 = AA * BB |
| * z_2 = E * (AA + a24 * E) |
| * |
| * All arithmetic (+, -, *, ^) above is in the finite field GF(2^255-19). |
| * |
| * TODO: investigate alternative laddersteps, such as the one in |
| * curve25519-donna, which may be faster. |
| * |
| * This routine runs in constant time. |
| * |
| * Flags: Flags have no meaning beyond the scope of this subroutine. |
| * |
| * @param[in] w9: x_1, Montgomery u-coordinate of point A (k < p) |
| * @param[in] w19: constant, w19 = 19 |
| * @param[in] w24: constant a24 = 121665 |
| * @param[in] w31: all-zero |
| * @param[in] MOD: p, modulus = 2^255 - 19 |
| * @param[in,out] w10: x_2 |
| * @param[in,out] w11: x_3 |
| * @param[in,out] w12: z_2 |
| * @param[in,out] w13: z_3 |
| * |
| * clobbered registers: w2 to w5, w10 to w13, w17, w18, w20 to w23 |
| * clobbered flag groups: FG0 |
| */ |
| ladderstep: |
| /* First, compute the new x_2 and z_2: |
| A = x_2 + z_2 |
| AA = A^2 |
| B = x_2 - z_2 |
| BB = B^2 |
| E = AA - BB |
| x_2 = AA * BB |
| z_2 = E * (AA + a24 * E) */ |
| |
| /* w2 <= w10 + w12 = x_2 + z_2 = A */ |
| bn.addm w2, w10, w12 |
| /* w3 <= w2^2 = A^2 = AA */ |
| bn.mov w22, w2 |
| jal x1, fe_square |
| bn.mov w3, w22 |
| /* w4 <= w10 - w12 = x_2 - z_2 = B */ |
| bn.subm w4, w10, w12 |
| /* w22 <= w4^2 = B^2 = BB */ |
| bn.mov w22, w4 |
| jal x1, fe_square |
| /* w5 <= w3 - w22 = AA - BB = E */ |
| bn.subm w5, w3, w22 |
| /* x_2 = w10 <= w22 * w3 = BB * AA */ |
| bn.mov w23, w3 |
| jal x1, fe_mul |
| bn.mov w10, w22 |
| /* w23 <= w5 = E */ |
| bn.mov w23, w5 |
| /* TODO: create faster, specialized fe_mul for a24 */ |
| /* w22 <= w24 * w23 = a24 * E */ |
| bn.mov w22, w24 |
| jal x1, fe_mul |
| /* w22 <= w3 + w22 = AA + (a24 * E) */ |
| bn.add w22, w3, w22 |
| /* z_2 = w12 <= w22 * w23 = (AA + a24 * E) * E */ |
| jal x1, fe_mul |
| bn.mov w12, w22 |
| |
| /* Now, compute the new x_3 and z_3, using A and B from the computation |
| above: |
| C = x_3 + z_3 |
| D = x_3 - z_3 |
| DA = D * A |
| CB = C * B |
| x_3 = (DA + CB)^2 |
| z_3 = x_1 * (DA - CB)^2 */ |
| |
| /* w22 <= w11 + w13 = x_3 + z_3 = C */ |
| bn.addm w22, w11, w13 |
| /* w4 <= w22 * w4 = C * B = CB */ |
| bn.mov w23, w4 |
| jal x1, fe_mul |
| bn.mov w4, w22 |
| /* w22 <= w11 - w13 = x_3 - z_3 = D */ |
| bn.subm w22, w11, w13 |
| /* w2 <= w22 * w2 = D * A = DA */ |
| bn.mov w23, w2 |
| jal x1, fe_mul |
| bn.mov w2, w22 |
| /* w22 <= w2 + w4 <= DA + CB */ |
| bn.addm w22, w2, w4 |
| /* x_3 = w11 <= w22^2 = (DA + CB)^2 */ |
| jal x1, fe_square |
| bn.mov w11, w22 |
| /* w22 <= w2 - w4 <= DA - CB */ |
| bn.subm w22, w2, w4 |
| /* w22 <= w22^2 = (DA - CB)^2 */ |
| jal x1, fe_square |
| /* z_3 = w13 <= w22 * w9 = (DA - CB)^2 * x_1 */ |
| bn.mov w23, w9 |
| jal x1, fe_mul |
| bn.mov w13, w22 |
| |
| ret |
| |
| .data |
| |
| /* Modulus p = 2^255 - 19. */ |
| .balign 32 |
| modulus25519: |
| .word 0xffffffed |
| .word 0xffffffff |
| .word 0xffffffff |
| .word 0xffffffff |
| .word 0xffffffff |
| .word 0xffffffff |
| .word 0xffffffff |
| .word 0x7fffffff |
| |
| /* Constant (a - 2) / 4 for Curve25519 = 121665 (see RFC 7748, section 5). */ |
| .balign 32 |
| a24: |
| .word 0x0001db41 |
| .zero 28 |