blob: c3747ce145f50aa0b34bcaee08b993e3f851f915 [file] [log] [blame]
/* Copyright lowRISC contributors. */
/* Licensed under the Apache License, Version 2.0, see LICENSE for details. */
/* SPDX-License-Identifier: Apache-2.0 */
/**
* This library contains arithmetic for the scalar field of the Ed25519
* signature scheme, which is modulo the curve order L:
*
* L = 2^252+27742317777372353535851937790883648493 (see RFC 8032, section 5.1)
*
* Elements of this field are used for scalar multiplication of curve points.
* The scalar field should not be confused with the field used for curve
* coordinates, which is modulo p = 2^255 - 19.
*/
/**
* Load constants for the scalar field.
*
* This routine should run before any other operations in this field are used,
* and again if its output is subsequently overwritten.
*
* This routine runs in constant time.
*
* @param[in] w31: all-zero
* @param[out] [w15:w14]: mu = floor(2^512 / L) (precomputed constant)
* @param[out] MOD: L, modulus
*
* clobbered registers: x2, x3, w14, w15, MOD
* clobbered flag groups: FG0
*/
.globl sc_init
sc_init:
/* Load modulus L into the MOD register. */
li x2, 14
la x3, ed25519_scalar_L
bn.lid x2, 0(x3)
bn.wsrw 0x0, w14
/* Load lower half of precomputed constant mu (260 bits).
w14 <= mu mod 2^256 */
la x3, ed25519_scalar_mu_low
bn.lid x2, 0(x3)
/* Higher half of mu is very small (15), so we can simply use addi here.
w15 <= 15 = mu >> 256 */
bn.addi w15, w31, 15
ret
/**
* Fully reduce a 512-bit number modulo L.
*
* Returns c = a mod L.
*
* Uses the Barrett reduction algorithm according to the Handbook of Applied
* Cryptography, section 14.3.3:
* https://cacr.uwaterloo.ca/hac/about/chap14.pdf
*
* For this use-case, b=2^256 and k=1. We will use two constant-time
* conditional subtractions instead of a while loop at the end (HAC shows that
* a maximum of two are needed). Keeping the same variable names but slightly
* simplifying the expression, the algorithm becomes:
*
* mu <- floor((2^512) / L) (precomputed)
* q3 <- (x * mu) >> 512
* r2 <- (q3 * L) mod 2^512
* r <- x + (r2 < r1 ? 2^512 : 0) - r2
* r <- r < L ? r : r - L
* r <- r < L ? r : r - L
* return r
*
* The conceptual strategy here is that we use the precomputed constant mu to
* estimate the quotient Q = floor(x / L). Above, the variable named "q3" is
* the estimated quotient, which can be proven to be off by at most two; as
* stated in HAC, Q - 2 <= q3 <= Q. From there, we can use the estimated
* quotient to estimate the remainder (x mod L = x - Q * L):
*
* x - Q * L <= x - (q3 * L) <= x - (Q - 2) * L
* x mod L <= x - (q3 * L) <= x mod L + 2 * L
*
* Since we know that (x - (q3 * L) < 2^512), we lose no precision by computing
* the product and subtraction modulo 2^512, and we save a few instructions.
* In fact, in this specific case, we can take this idea farther; L is only 253
* bits, so the upper bound of r (x mod L + 2 * L) is guaranteed to fit in 256
* bits. Therefore, it is sufficient to compute (x - (q3 * L)) modulo 2^256,
* which is what this implementation does.
*
* The algorithm used here, as described above, has been checked in the Coq
* proof assistant:
* https://gist.github.com/jadephilipoom/f70e740fbe885bf8b040374eca27a456
*
* Note that the proof covers only the algorithm; it doesn't have the exact
* instructions or a machine model of OTBN. The algorithm definition and the
* proven specification are reproduced below:
*
* Definition sc_reduce (x : N) :=
* let q2 := x * mu in
* let q3 := (q2 / (2 ^ 512)) in
* let r1 := x mod 2^256 in
* let r2 := (q3 * L) mod (2 ^ 256) in
* let r := r1 + (if r1 <? r2 then 2 ^ 256 else 0) - r2 in
* let r := if r <? L then r else r - L in
* let r := if r <? L then r else r - L in
* r.
*
* Lemma sc_reduce_correct x :
* x < 2^512 -> sc_reduce x = x mod L.
*
* Note: It's likely we need only one conditional subtraction here, for the
* same reason only one conditional subtraction is needed for P-256 Barrett
* reduction, but this needs further investigation to be certain and this
* routine is not performance-critical.
*
* This routine runs in constant time.
*
* Flags: Flags have no meaning beyond the scope of this subroutine.
*
* @param[in] [w17:w16]: a, number to be reduced (a < 2^510)
* @param[in] [w15:w14]: mu = floor(2^512 / L) (precomputed constant)
* @param[in] MOD: L, modulus
* @param[in] w31: all-zero
* @param[out] w18: c, result = a mod L
*
* clobbered registers: w10 to w13, w18
* clobbered flag groups: FG0
*/
.globl sc_reduce
sc_reduce:
/* First, compute q3 = (x * mu) >> 512.
Note: As described in HAC, we can probably optimize this by ignoring some
of the lower partial products of (x * mu), since only the high bits are used.
However, this optimization increases the possible error of the estimated
quotient q3, and ensuring it is used safely requires careful analysis.
Since this routine is not performance-critical, we compute the full
product here to be safe.
Since x has 512 bits and mu has 260 bits, rounding up to the nearest
64-bit limbs gives us a 512x320-bit multiplication. */
/* Update the accumulator to hold the carry from the lower 512 bits of the
product. The results from this part are stored in w13 but discarded. */
bn.mulqacc.z w14.0, w16.0, 0
bn.mulqacc w14.0, w16.1, 64
bn.mulqacc.so w13.L, w14.1, w16.0, 64
bn.mulqacc w14.0, w16.2, 0
bn.mulqacc w14.1, w16.1, 0
bn.mulqacc w14.2, w16.0, 0
bn.mulqacc w14.0, w16.3, 64
bn.mulqacc w14.1, w16.2, 64
bn.mulqacc w14.2, w16.1, 64
bn.mulqacc.so w13.U, w14.3, w16.0, 64
bn.mulqacc w14.0, w17.0, 0
bn.mulqacc w14.1, w16.3, 0
bn.mulqacc w14.2, w16.2, 0
bn.mulqacc w14.3, w16.1, 0
bn.mulqacc w15.0, w16.0, 0
bn.mulqacc w14.0, w17.1, 64
bn.mulqacc w14.1, w17.0, 64
bn.mulqacc w14.2, w16.3, 64
bn.mulqacc w14.3, w16.2, 64
bn.mulqacc.so w13.L, w15.0, w16.1, 64
bn.mulqacc w14.0, w17.2, 0
bn.mulqacc w14.1, w17.1, 0
bn.mulqacc w14.2, w17.0, 0
bn.mulqacc w14.3, w16.3, 0
bn.mulqacc w15.0, w16.2, 0
bn.mulqacc w14.0, w17.3, 64
bn.mulqacc w14.1, w17.2, 64
bn.mulqacc w14.2, w17.1, 64
bn.mulqacc w14.3, w17.0, 64
bn.mulqacc.so w13.U, w15.0, w16.3, 64
/* Finish computing the product (x * mu), and store the high bits.
[w13:w12] <= (x * mu) >> 512 = q3 */
bn.mulqacc w14.1, w17.3, 0
bn.mulqacc w14.2, w17.2, 0
bn.mulqacc w14.3, w17.1, 0
bn.mulqacc w15.0, w17.0, 0
bn.mulqacc w14.2, w17.3, 64
bn.mulqacc w14.3, w17.2, 64
bn.mulqacc.so w12.L, w15.0, w17.1, 64
bn.mulqacc w14.3, w17.3, 0
bn.mulqacc w15.0, w17.2, 0
bn.mulqacc.so w12.U, w15.0, w17.3, 64
bn.mulqacc.wo w13, w31.0, w31.0, 0
/* Load L from the MOD register.
w11 <= WSR[0x0] = MOD = L */
bn.wsrr w11, 0x0
/* Compute the value r2 = (q3 * L) mod 2^256. Since q3 has 260 bits and L has
253, we use a 320x256-bit multiplication, but we stop after the lowest 256
bits of the product are computed.
w10 <= ([w13:w12] * w11) mod 2^256 = (q3 * L) mod 2^256 = r2 mod 2^256 */
bn.mulqacc.z w12.0, w11.0, 0
bn.mulqacc w12.0, w11.1, 64
bn.mulqacc.so w10.L, w12.1, w11.0, 64
bn.mulqacc w12.0, w11.2, 0
bn.mulqacc w12.1, w11.1, 0
bn.mulqacc w12.2, w11.0, 0
bn.mulqacc w12.0, w11.3, 64
bn.mulqacc w12.1, w11.2, 64
bn.mulqacc w12.2, w11.1, 64
bn.mulqacc.so w10.U, w12.3, w11.0, 64
/* Compute r = (x - r2) mod 2^512.
Note that the conditional addition in HACS is consistent with the
default behavior of subtraction underflow in OTBN, so there is nothing
extra to do here. Additionally, because we know that (x - r2) < 2^256, it
holds that:
r = (x - r2) mod 2^512 = (x - r2) mod 2^256
= (x mod 2^256 - r2 mod 2^256) mod 2^256. */
/* w18 <= (x mod 2^256 - r2 mod 2^256) mod 2^256 = r */
bn.sub w18, w16, w10
/* Finally, two conditional subtractions to correct potential error. We can't
use addm for the first subtraction because it requires the sum to be less
than 2*L. */
/* First conditional subtraction: subtract L and then add it back if the
borrow flag is set (indicating underflow).
w18 <= w18 < L ? w18 : w18 - L */
bn.sub w18, w18, w11
bn.sel w10, w11, w31, C
bn.add w18, w18, w10
/* Second conditional subtraction can simply use add-modulo.
w18 <= w18 < L ? w18 : w18 - L = x mod L */
bn.addm w18, w18, w31
ret
/**
* Multiply two numbers and reduce modulo L.
*
* Returns c = (a * b) mod L.
*
* The operands a and b need to fit in 256 bits, but need not be fully reduced
* modulo L.
*
* This routine runs in constant time.
*
* Flags: Flags have no meaning beyond the scope of this subroutine.
*
* @param[in] w21: a, first operand
* @param[in] w22: b, second operand
* @param[in] [w15:w14]: mu = floor(2^512 / L) (precomputed constant)
* @param[in] MOD: L, modulus
* @param[in] w31: all-zero
* @param[out] w18: c, result = (a * b) mod L
*
* clobbered registers: w10 to w13, w16 to w18
* clobbered flag groups: FG0
*/
.globl sc_mul
sc_mul:
/* Compute the raw 512-bit product.
[w17:w16] <= a * b */
bn.mulqacc.z w21.0, w22.0, 0
bn.mulqacc w21.0, w22.1, 64
bn.mulqacc.so w16.L, w21.1, w22.0, 64
bn.mulqacc w21.0, w22.2, 0
bn.mulqacc w21.1, w22.1, 0
bn.mulqacc w21.2, w22.0, 0
bn.mulqacc w21.0, w22.3, 64
bn.mulqacc w21.1, w22.2, 64
bn.mulqacc w21.2, w22.1, 64
bn.mulqacc.so w16.U, w21.3, w22.0, 64
bn.mulqacc w21.1, w22.3, 0
bn.mulqacc w21.2, w22.2, 0
bn.mulqacc w21.3, w22.1, 0
bn.mulqacc w21.2, w22.3, 64
bn.mulqacc w21.3, w22.2, 64
bn.mulqacc.wo w17, w21.3, w22.3, 128
/* Reduce modulo L.
w18 <= (a * b) mod L */
jal x1, sc_reduce
ret
.data
/* Modulus L = 2^252+27742317777372353535851937790883648493 */
.balign 32
ed25519_scalar_L:
.word 0x5cf5d3ed
.word 0x5812631a
.word 0xa2f79cd6
.word 0x14def9de
.word 0x00000000
.word 0x00000000
.word 0x00000000
.word 0x10000000
/* Lower half of the precomputed constant mu = floor(2^512 / L) */
.balign 32
ed25519_scalar_mu_low:
.word 0x0a2c131b
.word 0xed9ce5a3
.word 0x086329a7
.word 0x2106215d
.word 0xffffffeb
.word 0xffffffff
.word 0xffffffff
.word 0xffffffff