blob: facf6dfc6be5f40b57d0b5528b850a353287aac8 [file] [log] [blame]
/* Copyright lowRISC contributors. */
/* Licensed under the Apache License, Version 2.0, see LICENSE for details. */
/* SPDX-License-Identifier: Apache-2.0 */
/**
* This library contains code for X25519, for use in elliptic-curve Diffie
* Hellman key exchange.
*
* Helpful references:
* RFC: https://datatracker.ietf.org/doc/html/rfc7748
* Academic paper: https://www.iacr.org/cryptodb/archive/2006/PKC/3351/3351.pdf
*/
/**
* Top-level X25519 function, as defined in RFC 7748.
*
* This routine runs in constant time.
*
* Flags: Flags have no meaning beyond the scope of this subroutine.
*
* @param[in] w8: enc(k), encoded scalar (256-bit random number)
* @param[in] w9: enc(u), encoded Montgomery u-coordinate (256 bits)
* @param[out] w22: result, X25519(k, u) as an encoded u-coordinate
*
* clobbered registers: w2 to w24
* clobbered flag groups: FG0
*/
.globl X25519
X25519:
/* Prepare all-zero register. */
bn.xor w31, w31, w31
/* Prepare constant for field operations.
w19 <= 19 */
bn.addi w19, w31, 19
/* Load modulus from DMEM.
MOD <= dmem[modulus25519] = 2^255 - 19 = p */
li x2, 2
la x3, modulus25519
bn.lid x2, 0(x3)
bn.wsrw 0x0, w2
/* Decode scalar. From RFC 7748, section 5:
"For X25519, in order to decode 32 random bytes as an integer scalar, set
the three least significant bits of the first byte and the most significant bit
of the last to zero, set the second most significant bit of the last byte to 1
and, finally, decode as little-endian." */
/* w8 <= (w8 >> 3) = enc(k) >> 3 */
bn.rshi w8, w31, w8 >> 3
/* w8 <= w8 << 5 = ((enc(k) >> 3) << 5) mod 2^256 */
bn.rshi w8, w8, w31 >> 251
/* w8 <= 2^254 + (w8 >> 2) = ((enc(k) >> 3) << 3) mod 2^254 + 2^254 = k*/
bn.addi w7, w31, 1
bn.rshi w8, w7, w8 >> 2
/* Decode point u-coordinate. As specified in RFC 7748, section 5, we must
zero the most significant bit and accept non-canonical values (i.e. cases
where p <= u) as if they had been reduced modulo p. */
/* w7 <= ~w31 = 2^256 - 1 */
bn.not w7, w31
/* w7 <= w7 >> 1 = 2^255 - 1 */
bn.rshi w7, w31, w7 >> 1
/* w9 <= w9 & w7 = enc(u) mod 2^255 */
bn.and w9, w9, w7
/* w9 <= w9 mod p <= (enc(u) mod 2^255) mod p = u */
bn.addm w9, w9, w31
/* Note that, in contrast to Ed25519, the given point does not have to be
validated; see the original Curve25519 paper for discussion (search
for "small-subgroup attacks" and "free key validation").
https://www.iacr.org/cryptodb/archive/2006/PKC/3351/3351.pdf */
/* Perform scalar multiplication.
w22 <= scalar_mult(k, u) = X25519(k, u) */
jal x1, scalar_mult
/* Since the scalar multiplication routine has already reduced its result
modulo p and OTBN is little-endian, encoding the result is a no-op. */
ret
/**
* Elliptic-curve scalar multiplication for Curve25519.
*
* Returns B = kA.
*
* Multiplies a point on the curve (A) with a scalar (k). Uses the Montgomery
* ladder algorithm as in RFC 7748. For the Montgomery ladder, we only need to
* consider the u-coordinate of curve points.
*
* See RFC 7748, section 5:
* https://datatracker.ietf.org/doc/html/rfc7748#section-5
*
* This routine runs in constant time.
*
* Flags: Flags have no meaning beyond the scope of this subroutine.
*
* @param[in] w8: k, 255-bit scalar value
* @param[in] w9: u, Montgomery u-coordinate of point A (k < p)
* @param[in] w19: constant, w19 = 19
* @param[in] w31, all-zero
* @param[in] MOD: p, modulus = 2^255 - 19
* @param[out] w22: result, Montgomery u-coordinate of point kA
*
* clobbered registers: w2 to w7, w10 to w18, w20 to w24
* clobbered flag groups: FG0
*/
scalar_mult:
/* Load the constant a24:
w24 <= dmem[a24] = 121665 */
li x2, 24
la x3, a24
bn.lid x2, 0(x3)
/* Set initial values. From RFC 7748:
x_2 = 1
z_2 = 0
x_3 = u
z_3 = 1
swap = 0
*/
/* x_2 = w10 <= 1 */
bn.addi w10, w31, 1
/* z_2 = w12 <= 0 */
bn.mov w12, w31
/* x_3 = w11 <= u */
bn.mov w11, w9
/* z_3 = w13 <= 1 */
bn.addi w13, w31, 1
/* swap = w15 <= 0 */
bn.mov w15, w31
/* Initialize w4 for loop.
w14 <= (k << 1) mod 2^256 */
bn.rshi w14, w8, w31 >> 255
/* Main loop. From RFC 7748 (with `ladderstep` factored into a separate step):
For t = bits-1 down to 0:
k_t = (k >> t) & 1
swap ^= k_t
(x_2, x_3) = cswap(swap, x_2, x_3)
(z_2, z_3) = cswap(swap, z_2, z_3)
swap = k_t
ladderstep(u, x_2, x_3, z_2, z_3)
Loop invariants:
w14 = (k << (255-t)) mod 2^256
w9 = u
Loop temporary registers: w6, w7
*/
loopi 255, 12
/* w7 <= w14 >> 255 = (k << (255-t)) mod 2^256 >> 255 = k[t] = k_t */
bn.rshi w7, w31, w14 >> 255
/* w14 <= w14 << 1 = (k << (255-(t-1)) mod 2^256 */
bn.rshi w14, w14, w31 >> 255
/* swap = w15 = w15 ^ w7 = swap ^ k_t */
bn.xor w15, w15, w7
/* Note: The xor above set FG0.Z if and only if swap ^ k_t is zero. */
/* Conditionally swap x_2 and x_3. */
/* w6 <= if swap =? 0 then x_2 else x_3 */
bn.sel w6, w10, w11, FG0.Z
/* x_3 = w11 <= if swap =? 0 then x_3 else x_2 */
bn.sel w11, w11, w10, FG0.Z
/* x_2 = w10 <= w6 = if swap =? 0 then x_2 else x_3 */
bn.mov w10, w6
/* Conditionally swap z_2 and z_3. */
/* w6 <= if swap =? 0 then z_2 else z_3 */
bn.sel w6, w12, w13, FG0.Z
/* z_3 = w13 <= if swap =? 0 then z_3 else z_2 */
bn.sel w13, w13, w12, FG0.Z
/* z_2 = w12 <= w6 = if swap =? 0 then z_2 else z_3 */
bn.mov w12, w6
/* swap = w15 <= w7 = k_t */
bn.mov w15, w7
jal x1, ladderstep
nop
/* Loop done; final steps. From RFC 7748:
(x_2, x_3) = cswap(swap, x_2, x_3)
(z_2, z_3) = cswap(swap, z_2, z_3)
Return x_2 * (z_2^(p - 2))
*/
/* Set the FG0.Z flag if swap (w15) is 0. */
bn.add w15, w15, w31
/* Conditionally swap x_2 and x_3. */
/* w6 <= if swap =? 0 then x_2 else x_3 */
bn.sel w6, w10, w11, FG0.Z
/* x_3 = w11 <= if swap =? 0 then x_3 else x_2 */
bn.sel w11, w11, w10, FG0.Z
/* x_2 = w10 <= w6 = if swap =? 0 then x_2 else x_3 */
bn.mov w10, w6
/* Conditionally swap z_2 and z_3. */
/* w6 <= if swap =? 0 then z_2 else z_3 */
bn.sel w6, w12, w13, FG0.Z
/* z_3 = w13 <= if swap =? 0 then z_3 else z_2 */
bn.sel w13, w13, w12, FG0.Z
/* z_2 = w12 <= w6 = if swap =? 0 then z_2 else z_3 */
bn.mov w12, w6
/* w22 <= fe_inv(w12) = z_2^(p-2) */
bn.mov w16, w12
jal x1, fe_inv
/* w22 <= w22 * w10 = (z_2^(p-2)) * x_2 */
bn.mov w23, w10
jal x1, fe_mul
ret
/**
* Performs the core arithmetic step of the Montgomery ladder.
*
* This subroutine is intended to be used as part of a Montgomery ladder
* implementation. It performs the core arithmetic in the main loop, i.e. the
* following steps (from RFC 7748, section 5):
* A = x_2 + z_2
* AA = A^2
* B = x_2 - z_2
* BB = B^2
* E = AA - BB
* C = x_3 + z_3
* D = x_3 - z_3
* DA = D * A
* CB = C * B
* x_3 = (DA + CB)^2
* z_3 = x_1 * (DA - CB)^2
* x_2 = AA * BB
* z_2 = E * (AA + a24 * E)
*
* All arithmetic (+, -, *, ^) above is in the finite field GF(2^255-19).
*
* TODO: investigate alternative laddersteps, such as the one in
* curve25519-donna, which may be faster.
*
* This routine runs in constant time.
*
* Flags: Flags have no meaning beyond the scope of this subroutine.
*
* @param[in] w9: x_1, Montgomery u-coordinate of point A (k < p)
* @param[in] w19: constant, w19 = 19
* @param[in] w24: constant a24 = 121665
* @param[in] w31: all-zero
* @param[in] MOD: p, modulus = 2^255 - 19
* @param[in,out] w10: x_2
* @param[in,out] w11: x_3
* @param[in,out] w12: z_2
* @param[in,out] w13: z_3
*
* clobbered registers: w2 to w5, w10 to w13, w17, w18, w20 to w23
* clobbered flag groups: FG0
*/
ladderstep:
/* First, compute the new x_2 and z_2:
A = x_2 + z_2
AA = A^2
B = x_2 - z_2
BB = B^2
E = AA - BB
x_2 = AA * BB
z_2 = E * (AA + a24 * E) */
/* w2 <= w10 + w12 = x_2 + z_2 = A */
bn.addm w2, w10, w12
/* w3 <= w2^2 = A^2 = AA */
bn.mov w22, w2
jal x1, fe_square
bn.mov w3, w22
/* w4 <= w10 - w12 = x_2 - z_2 = B */
bn.subm w4, w10, w12
/* w22 <= w4^2 = B^2 = BB */
bn.mov w22, w4
jal x1, fe_square
/* w5 <= w3 - w22 = AA - BB = E */
bn.subm w5, w3, w22
/* x_2 = w10 <= w22 * w3 = BB * AA */
bn.mov w23, w3
jal x1, fe_mul
bn.mov w10, w22
/* w23 <= w5 = E */
bn.mov w23, w5
/* TODO: create faster, specialized fe_mul for a24 */
/* w22 <= w24 * w23 = a24 * E */
bn.mov w22, w24
jal x1, fe_mul
/* w22 <= w3 + w22 = AA + (a24 * E) */
bn.add w22, w3, w22
/* z_2 = w12 <= w22 * w23 = (AA + a24 * E) * E */
jal x1, fe_mul
bn.mov w12, w22
/* Now, compute the new x_3 and z_3, using A and B from the computation
above:
C = x_3 + z_3
D = x_3 - z_3
DA = D * A
CB = C * B
x_3 = (DA + CB)^2
z_3 = x_1 * (DA - CB)^2 */
/* w22 <= w11 + w13 = x_3 + z_3 = C */
bn.addm w22, w11, w13
/* w4 <= w22 * w4 = C * B = CB */
bn.mov w23, w4
jal x1, fe_mul
bn.mov w4, w22
/* w22 <= w11 - w13 = x_3 - z_3 = D */
bn.subm w22, w11, w13
/* w2 <= w22 * w2 = D * A = DA */
bn.mov w23, w2
jal x1, fe_mul
bn.mov w2, w22
/* w22 <= w2 + w4 <= DA + CB */
bn.addm w22, w2, w4
/* x_3 = w11 <= w22^2 = (DA + CB)^2 */
jal x1, fe_square
bn.mov w11, w22
/* w22 <= w2 - w4 <= DA - CB */
bn.subm w22, w2, w4
/* w22 <= w22^2 = (DA - CB)^2 */
jal x1, fe_square
/* z_3 = w13 <= w22 * w9 = (DA - CB)^2 * x_1 */
bn.mov w23, w9
jal x1, fe_mul
bn.mov w13, w22
ret
.data
/* Modulus p = 2^255 - 19. */
.balign 32
modulus25519:
.word 0xffffffed
.word 0xffffffff
.word 0xffffffff
.word 0xffffffff
.word 0xffffffff
.word 0xffffffff
.word 0xffffffff
.word 0x7fffffff
/* Constant (a - 2) / 4 for Curve25519 = 121665 (see RFC 7748, section 5). */
.balign 32
a24:
.word 0x0001db41
.zero 28