opensecura / 3p / lowrisc / opentitan / 488e7aabbfe90b76c24a0fe2bd61dbce56258d90 / . / sw / otbn / crypto / ed25519_scalar.s

/* Copyright lowRISC contributors. */ | |

/* Licensed under the Apache License, Version 2.0, see LICENSE for details. */ | |

/* SPDX-License-Identifier: Apache-2.0 */ | |

/** | |

* This library contains arithmetic for the scalar field of the Ed25519 | |

* signature scheme, which is modulo the curve order L: | |

* | |

* L = 2^252+27742317777372353535851937790883648493 (see RFC 8032, section 5.1) | |

* | |

* Elements of this field are used for scalar multiplication of curve points. | |

* The scalar field should not be confused with the field used for curve | |

* coordinates, which is modulo p = 2^255 - 19. | |

*/ | |

/** | |

* Load constants for the scalar field. | |

* | |

* This routine should run before any other operations in this field are used, | |

* and again if its output is subsequently overwritten. | |

* | |

* This routine runs in constant time. | |

* | |

* @param[in] w31: all-zero | |

* @param[out] [w15:w14]: mu = floor(2^512 / L) (precomputed constant) | |

* @param[out] MOD: L, modulus | |

* | |

* clobbered registers: x2, x3, w14, w15, MOD | |

* clobbered flag groups: FG0 | |

*/ | |

.globl sc_init | |

sc_init: | |

/* Load modulus L into the MOD register. */ | |

li x2, 14 | |

la x3, ed25519_scalar_L | |

bn.lid x2, 0(x3) | |

bn.wsrw 0x0, w14 | |

/* Load lower half of precomputed constant mu (260 bits). | |

w14 <= mu mod 2^256 */ | |

la x3, ed25519_scalar_mu_low | |

bn.lid x2, 0(x3) | |

/* Higher half of mu is very small (15), so we can simply use addi here. | |

w15 <= 15 = mu >> 256 */ | |

bn.addi w15, w31, 15 | |

ret | |

/** | |

* Fully reduce a 512-bit number modulo L. | |

* | |

* Returns c = a mod L. | |

* | |

* Uses the Barrett reduction algorithm according to the Handbook of Applied | |

* Cryptography, section 14.3.3: | |

* https://cacr.uwaterloo.ca/hac/about/chap14.pdf | |

* | |

* For this use-case, b=2^256 and k=1. We will use two constant-time | |

* conditional subtractions instead of a while loop at the end (HAC shows that | |

* a maximum of two are needed). Keeping the same variable names but slightly | |

* simplifying the expression, the algorithm becomes: | |

* | |

* mu <- floor((2^512) / L) (precomputed) | |

* q3 <- (x * mu) >> 512 | |

* r2 <- (q3 * L) mod 2^512 | |

* r <- x + (r2 < r1 ? 2^512 : 0) - r2 | |

* r <- r < L ? r : r - L | |

* r <- r < L ? r : r - L | |

* return r | |

* | |

* The conceptual strategy here is that we use the precomputed constant mu to | |

* estimate the quotient Q = floor(x / L). Above, the variable named "q3" is | |

* the estimated quotient, which can be proven to be off by at most two; as | |

* stated in HAC, Q - 2 <= q3 <= Q. From there, we can use the estimated | |

* quotient to estimate the remainder (x mod L = x - Q * L): | |

* | |

* x - Q * L <= x - (q3 * L) <= x - (Q - 2) * L | |

* x mod L <= x - (q3 * L) <= x mod L + 2 * L | |

* | |

* Since we know that (x - (q3 * L) < 2^512), we lose no precision by computing | |

* the product and subtraction modulo 2^512, and we save a few instructions. | |

* In fact, in this specific case, we can take this idea farther; L is only 253 | |

* bits, so the upper bound of r (x mod L + 2 * L) is guaranteed to fit in 256 | |

* bits. Therefore, it is sufficient to compute (x - (q3 * L)) modulo 2^256, | |

* which is what this implementation does. | |

* | |

* The algorithm used here, as described above, has been checked in the Coq | |

* proof assistant: | |

* https://gist.github.com/jadephilipoom/f70e740fbe885bf8b040374eca27a456 | |

* | |

* Note that the proof covers only the algorithm; it doesn't have the exact | |

* instructions or a machine model of OTBN. The algorithm definition and the | |

* proven specification are reproduced below: | |

* | |

* Definition sc_reduce (x : N) := | |

* let q2 := x * mu in | |

* let q3 := (q2 / (2 ^ 512)) in | |

* let r1 := x mod 2^256 in | |

* let r2 := (q3 * L) mod (2 ^ 256) in | |

* let r := r1 + (if r1 <? r2 then 2 ^ 256 else 0) - r2 in | |

* let r := if r <? L then r else r - L in | |

* let r := if r <? L then r else r - L in | |

* r. | |

* | |

* Lemma sc_reduce_correct x : | |

* x < 2^512 -> sc_reduce x = x mod L. | |

* | |

* Note: It's likely we need only one conditional subtraction here, for the | |

* same reason only one conditional subtraction is needed for P-256 Barrett | |

* reduction, but this needs further investigation to be certain and this | |

* routine is not performance-critical. | |

* | |

* This routine runs in constant time. | |

* | |

* Flags: Flags have no meaning beyond the scope of this subroutine. | |

* | |

* @param[in] [w17:w16]: a, number to be reduced (a < 2^510) | |

* @param[in] [w15:w14]: mu = floor(2^512 / L) (precomputed constant) | |

* @param[in] MOD: L, modulus | |

* @param[in] w31: all-zero | |

* @param[out] w18: c, result = a mod L | |

* | |

* clobbered registers: w10 to w13, w18 | |

* clobbered flag groups: FG0 | |

*/ | |

.globl sc_reduce | |

sc_reduce: | |

/* First, compute q3 = (x * mu) >> 512. | |

Note: As described in HAC, we can probably optimize this by ignoring some | |

of the lower partial products of (x * mu), since only the high bits are used. | |

However, this optimization increases the possible error of the estimated | |

quotient q3, and ensuring it is used safely requires careful analysis. | |

Since this routine is not performance-critical, we compute the full | |

product here to be safe. | |

Since x has 512 bits and mu has 260 bits, rounding up to the nearest | |

64-bit limbs gives us a 512x320-bit multiplication. */ | |

/* Update the accumulator to hold the carry from the lower 512 bits of the | |

product. The results from this part are stored in w13 but discarded. */ | |

bn.mulqacc.z w14.0, w16.0, 0 | |

bn.mulqacc w14.0, w16.1, 64 | |

bn.mulqacc.so w13.L, w14.1, w16.0, 64 | |

bn.mulqacc w14.0, w16.2, 0 | |

bn.mulqacc w14.1, w16.1, 0 | |

bn.mulqacc w14.2, w16.0, 0 | |

bn.mulqacc w14.0, w16.3, 64 | |

bn.mulqacc w14.1, w16.2, 64 | |

bn.mulqacc w14.2, w16.1, 64 | |

bn.mulqacc.so w13.U, w14.3, w16.0, 64 | |

bn.mulqacc w14.0, w17.0, 0 | |

bn.mulqacc w14.1, w16.3, 0 | |

bn.mulqacc w14.2, w16.2, 0 | |

bn.mulqacc w14.3, w16.1, 0 | |

bn.mulqacc w15.0, w16.0, 0 | |

bn.mulqacc w14.0, w17.1, 64 | |

bn.mulqacc w14.1, w17.0, 64 | |

bn.mulqacc w14.2, w16.3, 64 | |

bn.mulqacc w14.3, w16.2, 64 | |

bn.mulqacc.so w13.L, w15.0, w16.1, 64 | |

bn.mulqacc w14.0, w17.2, 0 | |

bn.mulqacc w14.1, w17.1, 0 | |

bn.mulqacc w14.2, w17.0, 0 | |

bn.mulqacc w14.3, w16.3, 0 | |

bn.mulqacc w15.0, w16.2, 0 | |

bn.mulqacc w14.0, w17.3, 64 | |

bn.mulqacc w14.1, w17.2, 64 | |

bn.mulqacc w14.2, w17.1, 64 | |

bn.mulqacc w14.3, w17.0, 64 | |

bn.mulqacc.so w13.U, w15.0, w16.3, 64 | |

/* Finish computing the product (x * mu), and store the high bits. | |

[w13:w12] <= (x * mu) >> 512 = q3 */ | |

bn.mulqacc w14.1, w17.3, 0 | |

bn.mulqacc w14.2, w17.2, 0 | |

bn.mulqacc w14.3, w17.1, 0 | |

bn.mulqacc w15.0, w17.0, 0 | |

bn.mulqacc w14.2, w17.3, 64 | |

bn.mulqacc w14.3, w17.2, 64 | |

bn.mulqacc.so w12.L, w15.0, w17.1, 64 | |

bn.mulqacc w14.3, w17.3, 0 | |

bn.mulqacc w15.0, w17.2, 0 | |

bn.mulqacc.so w12.U, w15.0, w17.3, 64 | |

bn.mulqacc.wo w13, w31.0, w31.0, 0 | |

/* Load L from the MOD register. | |

w11 <= WSR[0x0] = MOD = L */ | |

bn.wsrr w11, 0x0 | |

/* Compute the value r2 = (q3 * L) mod 2^256. Since q3 has 260 bits and L has | |

253, we use a 320x256-bit multiplication, but we stop after the lowest 256 | |

bits of the product are computed. | |

w10 <= ([w13:w12] * w11) mod 2^256 = (q3 * L) mod 2^256 = r2 mod 2^256 */ | |

bn.mulqacc.z w12.0, w11.0, 0 | |

bn.mulqacc w12.0, w11.1, 64 | |

bn.mulqacc.so w10.L, w12.1, w11.0, 64 | |

bn.mulqacc w12.0, w11.2, 0 | |

bn.mulqacc w12.1, w11.1, 0 | |

bn.mulqacc w12.2, w11.0, 0 | |

bn.mulqacc w12.0, w11.3, 64 | |

bn.mulqacc w12.1, w11.2, 64 | |

bn.mulqacc w12.2, w11.1, 64 | |

bn.mulqacc.so w10.U, w12.3, w11.0, 64 | |

/* Compute r = (x - r2) mod 2^512. | |

Note that the conditional addition in HACS is consistent with the | |

default behavior of subtraction underflow in OTBN, so there is nothing | |

extra to do here. Additionally, because we know that (x - r2) < 2^256, it | |

holds that: | |

r = (x - r2) mod 2^512 = (x - r2) mod 2^256 | |

= (x mod 2^256 - r2 mod 2^256) mod 2^256. */ | |

/* w18 <= (x mod 2^256 - r2 mod 2^256) mod 2^256 = r */ | |

bn.sub w18, w16, w10 | |

/* Finally, two conditional subtractions to correct potential error. We can't | |

use addm for the first subtraction because it requires the sum to be less | |

than 2*L. */ | |

/* First conditional subtraction: subtract L and then add it back if the | |

borrow flag is set (indicating underflow). | |

w18 <= w18 < L ? w18 : w18 - L */ | |

bn.sub w18, w18, w11 | |

bn.sel w10, w11, w31, C | |

bn.add w18, w18, w10 | |

/* Second conditional subtraction can simply use add-modulo. | |

w18 <= w18 < L ? w18 : w18 - L = x mod L */ | |

bn.addm w18, w18, w31 | |

ret | |

/** | |

* Multiply two numbers and reduce modulo L. | |

* | |

* Returns c = (a * b) mod L. | |

* | |

* The operands a and b need to fit in 256 bits, but need not be fully reduced | |

* modulo L. | |

* | |

* This routine runs in constant time. | |

* | |

* Flags: Flags have no meaning beyond the scope of this subroutine. | |

* | |

* @param[in] w21: a, first operand | |

* @param[in] w22: b, second operand | |

* @param[in] [w15:w14]: mu = floor(2^512 / L) (precomputed constant) | |

* @param[in] MOD: L, modulus | |

* @param[in] w31: all-zero | |

* @param[out] w18: c, result = (a * b) mod L | |

* | |

* clobbered registers: w10 to w13, w16 to w18 | |

* clobbered flag groups: FG0 | |

*/ | |

.globl sc_mul | |

sc_mul: | |

/* Compute the raw 512-bit product. | |

[w17:w16] <= a * b */ | |

bn.mulqacc.z w21.0, w22.0, 0 | |

bn.mulqacc w21.0, w22.1, 64 | |

bn.mulqacc.so w16.L, w21.1, w22.0, 64 | |

bn.mulqacc w21.0, w22.2, 0 | |

bn.mulqacc w21.1, w22.1, 0 | |

bn.mulqacc w21.2, w22.0, 0 | |

bn.mulqacc w21.0, w22.3, 64 | |

bn.mulqacc w21.1, w22.2, 64 | |

bn.mulqacc w21.2, w22.1, 64 | |

bn.mulqacc.so w16.U, w21.3, w22.0, 64 | |

bn.mulqacc w21.1, w22.3, 0 | |

bn.mulqacc w21.2, w22.2, 0 | |

bn.mulqacc w21.3, w22.1, 0 | |

bn.mulqacc w21.2, w22.3, 64 | |

bn.mulqacc w21.3, w22.2, 64 | |

bn.mulqacc.wo w17, w21.3, w22.3, 128 | |

/* Reduce modulo L. | |

w18 <= (a * b) mod L */ | |

jal x1, sc_reduce | |

ret | |

.data | |

/* Modulus L = 2^252+27742317777372353535851937790883648493 */ | |

.balign 32 | |

ed25519_scalar_L: | |

.word 0x5cf5d3ed | |

.word 0x5812631a | |

.word 0xa2f79cd6 | |

.word 0x14def9de | |

.word 0x00000000 | |

.word 0x00000000 | |

.word 0x00000000 | |

.word 0x10000000 | |

/* Lower half of the precomputed constant mu = floor(2^512 / L) */ | |

.balign 32 | |

ed25519_scalar_mu_low: | |

.word 0x0a2c131b | |

.word 0xed9ce5a3 | |

.word 0x086329a7 | |

.word 0x2106215d | |

.word 0xffffffeb | |

.word 0xffffffff | |

.word 0xffffffff | |

.word 0xffffffff |