| /* Copyright lowRISC Contributors. |
| * Copyright 2016 The Chromium OS Authors. All rights reserved. |
| * Use of this source code is governed by a BSD-style license that can be |
| * found in the LICENSE.dcrypto file. |
| * |
| * Derived from code in |
| * https://chromium.googlesource.com/chromiumos/platform/ec/+/refs/heads/cr50_stab/chip/g/dcrypto/dcrypto_p256.c |
| */ |
| |
| .globl p256_isoncurve |
| .globl p256_scalar_mult |
| .globl p256_base_mult |
| .globl p256_sign |
| .globl p256_verify |
| .globl proj_add |
| .globl p256_generate_k |
| .globl p256_generate_random_key |
| .globl p256_key_from_seed |
| |
| .text |
| |
| /** |
| * Trigger a fault if the FG0.Z flag is 1. |
| * |
| * If the flag is 1, then this routine will trigger an `ILLEGAL_INSN` error and |
| * abort the OTBN program. If the flag is 0, the routine will essentially do |
| * nothing. |
| * |
| * NOTE: Be careful when calling this routine that the FG0.Z flag is not |
| * sensitive; since aborting the program will be quicker than completing it, |
| * the flag's value is likely clearly visible to an attacker through timing. |
| * |
| * @param[in] w31: all-zero |
| * @param[in] FG0.Z: boolean indicating fault condition |
| * |
 * clobbered registers: x2, x3
| * clobbered flag groups: none |
| */ |
trigger_fault_if_fg0_z:
  /* Read the FG0.Z flag (bit position 3 of the flags CSR at 0x7c0).
       x2 <= FG0.Z */
  csrrw x2, 0x7c0, x0
  andi x2, x2, 8
  srli x2, x2, 3

  /* Subtract FG0.Z from 0, expanding the single flag bit into an
     all-ones (invalid address) or all-zero (valid address 0) mask.
       x2 <= 0 - x2 = FG0.Z ? 2^32 - 1 : 0 */
  sub x2, x0, x2

  /* The `bn.lid` instruction causes a `BAD_DATA_ADDR` error if the
     memory address is out of bounds. Therefore, if FG0.Z is 1, this
     instruction causes an error, but if FG0.Z is 0 it simply loads the word at
     address 0 into w31. */
  li x3, 31
  bn.lid x3, 0(x2)

  /* If we get here, the flag must have been 0. Restore w31 to zero and return.
       w31 <= 0 */
  bn.xor w31, w31, w31

  ret
| |
| /** |
| * Reduce a 512-bit value by a 256-bit P-256 modulus (either n or p). |
| * |
| * Returns c = a mod m |
| * |
| * Expects a 512 bit input, a 256 bit modulus and pre-computed parameter u for |
| * Barrett reduction (usually greek mu in literature). u is expected without |
| * the leading 1 at bit 256. u has to be pre-computed as u = floor(2^512/p). |
| * This guarantees that u > 2^256, however, in order for u to be at most |
| * 2^257-1, it has to be ensured that m >= 2^255 + 1. |
| * |
| * This implementation mostly follows the description in the |
| * "Handbook of Applied Cryptography" in Algorithm 14.42. |
| * Differences: |
| * - The computation of q2 ignores the MSbs of q1 and u to allow using |
| * a 256x256 bit multiplication. This is compensated later by |
| * individual (conditional) additions. |
| * - The truncations in step 2 of HAC 14.42 in the form of (... mod b^(k+1) ) |
| * are not implemented here and the full register width is used. This |
| * allows to omit computation of r1 (since r1=x) and step 3 of HAC 14.42 |
| * - There is only a single conditional subtraction of the modulus at the end |
| * |
| * Note that this implementation is targeted and tested to be used with modulus |
| * and Barrett constant of the P-256 underlying finite field only. For a |
| * generic modulus a 2nd conditional subtraction of the modulus has to be |
| * added or the modulus has to be in a range such that it can be mathematically |
| * proven that a single subtraction is sufficient. The following condition is |
| * sufficient (although potentially not necessary): |
| * b * (b^2k mod m) + mu <= b^(k+1) |
| * |
| * For OTBN with a 256-bit modulus, we have b=2^256 and k=1, so this is: |
| * 2^256 * (2^512 mod m) + mu <= 2^512 |
| * |
| * Flags: Flags have no meaning beyond the scope of this subroutine. |
| * |
 * @param[in] [w20,w19]: a, input (512 bits, w20 = upper half) such that
 *                       a < 2^256 * m
| * @param[in] w22: correction factor, msb(a) * u |
| * @param[in] w29: m, modulus, curve order n or finite field modulus p |
| * @param[in] w28: u, lower 256 bit of Barrett constant for m |
| * @param[in] w31: all-zero |
| * @param[in] MOD: m, modulus |
| * @param[out] w19: c, result |
| * |
| * clobbered registers: w19, w20, w21, w22, w23, w24, w25 |
| * clobbered flag groups: FG0 |
| */ |
p256_reduce:
  /* Compute q1' = q1[255:0] = x >> 255
       w21 = q1' = [w20, w19] >> 255 */
  bn.rshi w21, w20, w19 >> 255

  /* Compute q2 = q1*u
     Instead of full q2 (which would be up to 514 bits) we ignore the MSb of u
     and the MSb of q1 and correct this later. This allows using a 256x256
     multiplier.
     => max. length q2': 512 bit
       q2' = q1[255:0]*u[255:0] = [w24, w23] = w21 * w28 */
  bn.mulqacc.z w21.0, w28.0, 0
  bn.mulqacc w21.1, w28.0, 64
  bn.mulqacc.so w23.L, w21.0, w28.1, 64
  bn.mulqacc w21.2, w28.0, 0
  bn.mulqacc w21.1, w28.1, 0
  bn.mulqacc w21.0, w28.2, 0
  bn.mulqacc w21.3, w28.0, 64
  bn.mulqacc w21.2, w28.1, 64
  bn.mulqacc w21.1, w28.2, 64
  bn.mulqacc.so w23.U, w21.0, w28.3, 64
  bn.mulqacc w21.3, w28.1, 0
  bn.mulqacc w21.2, w28.2, 0
  bn.mulqacc w21.1, w28.3, 0
  bn.mulqacc w21.3, w28.2, 64
  bn.mulqacc.so w24.L, w21.2, w28.3, 64
  bn.mulqacc.so w24.U, w21.3, w28.3, 0
  /* NOTE(review): adds zero, leaving w20 unchanged; its only visible effect
     is updating the FG0 flags from w20, and those flags are overwritten
     before being consumed — confirm whether this instruction is needed. */
  bn.add w20, w20, w31

  /* q3 = q2 >> 257
     In this step, the compensation for the neglected MSbs of q1 and u is
     carried out underway. To add them in the q2 domain, they would have to be
     left shifted by 256 bit first. To directly add them we first shift q2' by
     256 bit to the right (by dropping the lower 256 bit), perform the
     additions, and shift the result another bit to the right. */

  /* w25 = q1[256] = x[511:256] >> 255 */
  bn.rshi w25, w31, w20 >> 255

  /* compensate for neglected MSB of u, by adding full q1
     this is unconditional since the MSB of u is always 1
       [w25, w24] = q2'' <= q2'[511:256] + q1 = w24 + [w25, w21] */
  bn.add w24, w24, w21
  bn.addc w25, w25, w31

  /* compensate for neglected MSB of q1, by conditionally adding u
     (w22 was pre-set by the caller to u or 0 depending on MSb of x) */
  /* [w25, w24] = q2''' <= q2'' + [0 or u] = [w25, w24] + w22 */
  bn.add w24, w24, w22
  bn.addc w25, w25, w31

  /* q3 = w21 = q2''' >> 1 */
  bn.rshi w21, w25, w24 >> 1

  /* [w23, w22] <= q3 * m = w21 * w29 */
  bn.mulqacc.z w29.0, w21.0, 0
  bn.mulqacc w29.1, w21.0, 64
  bn.mulqacc.so w22.L, w29.0, w21.1, 64
  bn.mulqacc w29.2, w21.0, 0
  bn.mulqacc w29.1, w21.1, 0
  bn.mulqacc w29.0, w21.2, 0
  bn.mulqacc w29.3, w21.0, 64
  bn.mulqacc w29.2, w21.1, 64
  bn.mulqacc w29.1, w21.2, 64
  bn.mulqacc.so w22.U, w29.0, w21.3, 64
  bn.mulqacc w29.3, w21.1, 0
  bn.mulqacc w29.2, w21.2, 0
  bn.mulqacc w29.1, w21.3, 0
  bn.mulqacc w29.3, w21.2, 64
  bn.mulqacc.so w23.L, w29.2, w21.3, 64
  bn.mulqacc.so w23.U, w29.3, w21.3, 0
  /* NOTE(review): adds zero, leaving w23 unchanged; flags are overwritten by
     the subtraction below — confirm whether this instruction is needed. */
  bn.add w23, w23, w31

  /* We compute the final remainder r by subtracting the estimate q3*m from x.
     In the generic algorithm, r is already the reduced result or it is off by
     either m or 2m. For the special case of the modulus of P-256 it can be
     shown that r can only be off by max m. Therefore, only a single
     conditional correction is required.
       [w20, w22] = r <= [w20, w19] - [w23, w22] = x - q3*m */
  bn.sub w22, w19, w22
  bn.subb w20, w20, w23

  /* r cannot be wider than 257 bit. Therefore r > m if bit 256 of r is set;
     that bit is bit 0 of w20, which the subtraction above latched into the
     FG0.L flag, so select the modulus for subtraction iff it is set. */
  bn.sel w21, w29, w31, L
  bn.sub w21, w22, w21

  /* this performs the correction in case r is only 256 bit long but still
     greater than the modulus (bn.addm with 0 reduces once mod MOD) */
  bn.addm w19, w21, w31

  ret
| |
| /** |
| * 256-bit modular multiplication for P-256 coordinate and scalar fields. |
| * |
| * Returns c = a * b mod m |
| * |
| * Expects two 256 bit operands, 256 bit modulus and pre-computed parameter u |
| * for Barrett reduction (usually greek mu in literature). See the note above |
| * `p256_reduce` for details. |
| * |
| * Flags: Flags have no meaning beyond the scope of this subroutine. |
| * |
| * @param[in] w24: a, first 256 bit operand (a * b < 2^256 * p) |
| * @param[in] w25: b, second 256 bit operand (a * b < 2^256 * p) |
| * @param[in] w29: m, modulus, curve order n or finite field modulus p |
| * @param[in] w28: u, lower 256 bit of Barrett constant for curve P-256 |
| * @param[in] w31: all-zero |
| * @param[in] MOD: p, modulus of P-256 underlying finite field |
| * @param[out] w19: c, result |
| * |
| * clobbered registers: w19, w20, w21, w22, w23, w24, w25 |
| * clobbered flag groups: FG0 |
| */ |
mod_mul_256x256:
  /* Compute the integer product of the operands x = a * b
       x = [w20, w19] = a * b = w24 * w25
     => max. length x: 512 bit */
  bn.mulqacc.z w24.0, w25.0, 0
  bn.mulqacc w24.1, w25.0, 64
  bn.mulqacc.so w19.L, w24.0, w25.1, 64
  bn.mulqacc w24.2, w25.0, 0
  bn.mulqacc w24.1, w25.1, 0
  bn.mulqacc w24.0, w25.2, 0
  bn.mulqacc w24.3, w25.0, 64
  bn.mulqacc w24.2, w25.1, 64
  bn.mulqacc w24.1, w25.2, 64
  bn.mulqacc.so w19.U, w24.0, w25.3, 64
  bn.mulqacc w24.3, w25.1, 0
  bn.mulqacc w24.2, w25.2, 0
  bn.mulqacc w24.1, w25.3, 0
  bn.mulqacc w24.3, w25.2, 64
  bn.mulqacc.so w20.L, w24.2, w25.3, 64
  bn.mulqacc.so w20.U, w24.3, w25.3, 0

  /* Store correction factor to compensate for later neglected MSb of x.
     x is 512 bit wide and therefore the 255 bit right shifted version q1
     (inside p256_reduce) contains 257 bit. Bit 256 of q1 is neglected to
     allow using a 256x256 multiplier. For the MSb of x being set we
     temporarily store u (or zero otherwise) here, to be used in a later
     constant time correction of a multiplication with u. Note that this
     requires the M flag still to be set from the final bn.mulqacc.so above
     (it reflects the MSb of w20, i.e. bit 511 of x). */
  bn.sel w22, w28, w31, M

  /* Reduce product modulo m: w19 <= [w20, w19] mod w29 */
  jal x1, p256_reduce

  ret
| |
| /** |
| * 320- by 128-bit modular multiplication for P-256 coordinate and scalar fields. |
| * |
| * Returns c = a * b mod m |
| * |
| * Expects two operands, 256 bit modulus and pre-computed parameter u |
| * for Barrett reduction (usually greek mu in literature). See the note above |
| * `p256_reduce` for details. |
| * |
| * Flags: Flags have no meaning beyond the scope of this subroutine. |
| * |
| * @param[in] w24: a0, low part of 320-bit operand a (256 bits) |
| * @param[in] w25: a1, high part of 320-bit operand a (64 bits) |
| * @param[in] w26: b, second operand (128 bits) |
| * @param[in] w29: m, modulus, curve order n or finite field modulus p |
| * @param[in] w28: u, lower 256 bit of Barrett constant for curve P-256 |
| * @param[in] w31: all-zero |
| * @param[in] MOD: p, modulus of P-256 underlying finite field |
| * @param[out] w19: c, result |
| * |
| * clobbered registers: w19, w20, w21, w22, w23, w24, w25 |
| * clobbered flag groups: FG0 |
| */ |
mod_mul_320x128:
  /* Compute the integer product of the operands x = a * b
       x = [w20, w19] = a * b = [w25, w24] * w26
     => max. length of x: 448 bit */
  bn.mulqacc.z w24.0, w26.0, 0
  bn.mulqacc w24.0, w26.1, 64
  bn.mulqacc.so w19.L, w24.1, w26.0, 64
  bn.mulqacc w24.1, w26.1, 0
  bn.mulqacc w24.2, w26.0, 0
  bn.mulqacc w24.2, w26.1, 64
  bn.mulqacc.so w19.U, w24.3, w26.0, 64
  bn.mulqacc w24.3, w26.1, 0
  bn.mulqacc w25.0, w26.0, 0
  bn.mulqacc.wo w20, w25.0, w26.1, 64

  /* Store correction factor to compensate for later neglected MSb of x.
     The 255-bit right-shifted version q1 of a full 512-bit x (inside
     p256_reduce) would contain 257 bits; bit 256 of q1 is neglected to allow
     using a 256x256 multiplier, and if the MSb of x is set we temporarily
     store u (or zero otherwise) here for a later constant-time correction.
     Note that this requires the M flag still to be set from the final
     bn.mulqacc.wo above. For operands within the documented widths
     (a < 2^320, b < 2^128) the product is below 2^448, so the MSb is never
     set and w22 is always zero; the selection is kept for consistency with
     mod_mul_256x256. */
  bn.sel w22, w28, w31, M

  /* Reduce product modulo m: w19 <= [w20, w19] mod w29 */
  jal x1, p256_reduce

  ret
| |
| /** |
| * Checks if a point is a valid curve point on curve P-256 (secp256r1) |
| * |
| * Returns r = x^3 + ax + b mod p |
| * and s = y^2 mod p |
| * with x,y being the affine coordinates of the curve point |
| * a, b and p being the domain parameters of P-256 |
| * |
| * This routine checks if a point with given x- and y-coordinate is a valid |
| * curve point on P-256. |
| * The routine checks whether the coordinates are a solution of the |
| * Weierstrass equation y^2 = x^3 + ax + b mod p. |
| * The routine makes use of the property that the domain parameter 'a' can be |
| * written as a=-3 for the P-256 curve, hence the routine is limited to P-256. |
| * The routine does not return a boolean result but computes the left side |
| * and the right sight of the Weierstrass equation and leaves the final |
| * comparison to the caller. |
| * The routine runs in constant time. |
| * |
| * Flags: Flags have no meaning beyond the scope of this subroutine. |
| * |
| * @param[in] dmem[x]: affine x-coordinate of input point |
| * @param[in] dmem[y]: affine y-coordinate of input point |
| * @param[out] dmem[r]: right side result r |
| * @param[out] dmem[s]: left side result s |
| * |
 * clobbered registers: x2, x3, x19, x20, w19 to w29, w31
| * clobbered flag groups: FG0 |
| */ |
p256_isoncurve:

  /* setup all-zero reg */
  bn.xor w31, w31, w31

  /* setup modulus p and Barrett constant u
     MOD <= w29 <= dmem[p256_p] = p; w28 <= dmem[p256_u_p] = u_p */
  li x2, 29
  la x3, p256_p
  bn.lid x2, 0(x3)
  bn.wsrw 0, w29
  li x2, 28
  la x3, p256_u_p
  bn.lid x2, 0(x3)

  /* load domain parameter b from dmem
       w27 <= b = dmem[p256_b] */
  li x2, 27
  la x3, p256_b
  bn.lid x2, 0(x3)

  /* load affine y-coordinate of curve point from dmem
       w24 <= dmem[y] */
  la x3, y
  li x2, 24
  bn.lid x2, 0(x3)

  /* w19 <= y^2 mod p = w24*w25 = y*y */
  bn.mov w25, w24
  jal x1, mod_mul_256x256

  /* store left side result: dmem[s] <= w19 = y^2 mod p */
  la x20, s
  li x2, 19
  bn.sid x2, 0(x20)

  /* load affine x-coordinate of curve point from dmem
       w26 <= dmem[x] */
  la x3, x
  li x2, 26
  bn.lid x2, 0(x3)

  /* w19 <= x^2 = w26*w26 */
  bn.mov w25, w26
  bn.mov w24, w26
  jal x1, mod_mul_256x256

  /* w19 = x^3 <= x^2 * x = w25*w24 = w19*w26 */
  bn.mov w25, w19
  bn.mov w24, w26
  jal x1, mod_mul_256x256

  /* for curve P-256, 'a' can be written as a = -3, therefore we subtract
     x three times from x^3.
       w19 = x^3 + ax <= x^3 - 3x mod p */
  bn.subm w19, w19, w26
  bn.subm w19, w19, w26
  bn.subm w19, w19, w26

  /* w19 <= x^3 + ax + b mod p = w19 + w27 mod p */
  bn.addm w19, w19, w27

  /* store right side result: dmem[r] <= w19 = x^3 + ax + b mod p */
  la x19, r
  li x2, 19
  bn.sid x2, 0(x19)

  ret
| |
| |
| /** |
| * P-256 point addition in projective coordinates |
| * |
| * returns R = (x_r, y_r, z_r) <= P+Q = (x_p, y_p, z_p) + (x_q, y_q, z_q) |
| * with R, P and Q being valid P-256 curve points |
| * in projective coordinates |
| * |
| * This routine adds two valid P-256 curve points in projective space. |
| * Point addition is performed based on the complete formulas of Bosma and |
| * Lenstra for Weierstrass curves as first published in [1] and |
| * optimized in [2]. |
| * The implemented version follows Algorithm 4 of [2] which is an optimized |
| * variant for Weierstrass curves with domain parameter 'a' set to a=-3. |
| * Numbering of the steps below and naming of symbols follows the |
| * terminology of Algorithm 4 of [2]. |
| * The routine is limited to P-256 curve points due to: |
| * - fixed a=-3 domain parameter |
| * - usage of a P-256 optimized Barrett multiplication kernel |
| * This routine runs in constant time. |
| * |
| * [1] https://doi.org/10.1006/jnth.1995.1088 |
| * [2] https://doi.org/10.1007/978-3-662-49890-3_16 |
| * |
| * @param[in] w8: x_p, x-coordinate of input point P |
| * @param[in] w9: y_p, y-coordinate of input point P |
| * @param[in] w10: z_p, z-coordinate of input point P |
| * @param[in] w11: x_q, x-coordinate of input point Q |
 * @param[in]  w12: y_q, y-coordinate of input point Q
 * @param[in]  w13: z_q, z-coordinate of input point Q
| * @param[in] w27: b, curve domain parameter |
| * @param[in] w29: p, modulus, 2^256 > p > 2^255. |
 * @param[in]  w28: u, pre-computed Barrett constant (without u[256]/MSb
 *                  of u, which is always 1 for the allowed range).
| * @param[in] w31: all-zero. |
| * @param[in] MOD: p, modulus, 2^256 > p > 2^255. |
| * @param[out] w11: x_r, x-coordinate of resulting point R |
 * @param[out] w12: y_r, y-coordinate of resulting point R
 * @param[out] w13: z_r, z-coordinate of resulting point R
| * |
| * Flags: Flags have no meaning beyond the scope of this subroutine. |
| * |
| * clobbered registers: w11 to w25 |
| * clobbered flag groups: FG0 |
| */ |
proj_add:
  /* mapping of parameters to symbols of [2] (Algorithm 4):
     X1 = x_p; Y1 = y_p; Z1 = z_p; X2 = x_q; Y2 = y_q; Z2 = z_q
     X3 = x_r; Y3 = y_r; Z3 = z_r
     All multiplications go through mod_mul_256x256
     (inputs w24, w25; result w19; clobbers w19 to w25). */

  /* 1: w14 = t0 <= X1*X2 = w11*w8 */
  bn.mov w24, w11
  bn.mov w25, w8
  jal x1, mod_mul_256x256
  bn.mov w14, w19

  /* 2: w15 = t1 <= Y1*Y2 = w12*w9 */
  bn.mov w24, w12
  bn.mov w25, w9
  jal x1, mod_mul_256x256
  bn.mov w15, w19

  /* 3: w16 = t2 <= Z1*Z2 = w13*w10 */
  bn.mov w24, w13
  bn.mov w25, w10
  jal x1, mod_mul_256x256
  bn.mov w16, w19

  /* 5: w17 = t4 <= X2+Y2 = w11 + w12 */
  bn.addm w17, w11, w12

  /* 4: w18 = t3 <= X1+Y1 = w8+w9 */
  bn.addm w18, w8, w9

  /* 6: w19 = t3 <= t3*t4 = w18*w17 */
  bn.mov w24, w17
  bn.mov w25, w18
  jal x1, mod_mul_256x256

  /* 7: w18 = t4 <= t0+t1 = w14+w15 */
  bn.addm w18, w14, w15

  /* 8: w17 = t3 <= t3 - t4 = w19 - w18 */
  bn.subm w17, w19, w18

  /* 10: w18 = X3 <= Y2 + Z2 = w12 + w13 */
  bn.addm w18, w12, w13

  /* 9: w19 = t4 <= Y1 + Z1 = w9 + w10 */
  bn.addm w19, w9, w10

  /* 11: w18 = t4 <= t4 * X3 = w19 * w18 */
  bn.mov w24, w18
  bn.mov w25, w19
  jal x1, mod_mul_256x256
  bn.mov w18, w19

  /* 12: w19 = X3 <= t1 + t2 = w15 + w16 */
  bn.addm w19, w15, w16

  /* 13: w18 = t4 <= t4 - X3 = w18 - w19 */
  bn.subm w18, w18, w19

  /* 15: w19 = Y3 <= X2 + Z2 = w11 + w13 */
  bn.addm w19, w11, w13

  /* 14: w12 = X3 <= X1 + Z1 = w8 + w10 */
  bn.addm w12, w8, w10

  /* 16: w11 = X3 <= X3 * Y3 = w12 * w19 */
  bn.mov w24, w19
  bn.mov w25, w12
  jal x1, mod_mul_256x256
  bn.mov w11, w19

  /* 17: w12 = Y3 <= t0 + t2 = w14 + w16 */
  bn.addm w12, w14, w16

  /* 18: w12 = Y3 <= X3 - Y3 = w11 - w12 */
  bn.subm w12, w11, w12

  /* 19: w19 = Z3 <= b * t2 = w27 * w16 */
  bn.mov w24, w27
  bn.mov w25, w16
  jal x1, mod_mul_256x256

  /* 20: w11 = X3 <= Y3 - Z3 = w12 - w19 */
  bn.subm w11, w12, w19

  /* 21: w13 = Z3 <= X3 + X3 = w11 + w11 */
  bn.addm w13, w11, w11

  /* 22: w11 = X3 <= w11 + w13 = X3 + Z3 */
  bn.addm w11, w11, w13

  /* 23: w13 = Z3 <= t1 - X3 = w15 - w11 */
  bn.subm w13, w15, w11

  /* 24: w11 = X3 <= t1 + X3 = w15 + w11 */
  bn.addm w11, w15, w11

  /* 25: w19 = Y3 <= w27 * w12 = b * Y3 */
  bn.mov w24, w27
  bn.mov w25, w12
  jal x1, mod_mul_256x256

  /* 26: w15 = t1 <= t2 + t2 = w16 + w16 */
  bn.addm w15, w16, w16

  /* 27: w16 = t2 <= t1 + t2 = w15 + w16 */
  bn.addm w16, w15, w16

  /* 28: w12 = Y3 <= Y3 - t2 = w19 - w16 */
  bn.subm w12, w19, w16

  /* 29: w12 = Y3 <= Y3 - t0 = w12 - w14 */
  bn.subm w12, w12, w14

  /* 30: w15 = t1 <= Y3 + Y3 = w12 + w12 */
  bn.addm w15, w12, w12

  /* 31: w12 = Y3 <= t1 + Y3 = w15 + w12 */
  bn.addm w12, w15, w12

  /* 32: w15 = t1 <= t0 + t0 = w14 + w14 */
  bn.addm w15, w14, w14

  /* 33: w14 = t0 <= t1 + t0 = w15 + w14 */
  bn.addm w14, w15, w14

  /* 34: w14 = t0 <= t0 - t2 = w14 - w16 */
  bn.subm w14, w14, w16

  /* 35: w15 = t1 <= t4 * Y3 = w18 * w12 */
  bn.mov w24, w18
  bn.mov w25, w12
  jal x1, mod_mul_256x256
  bn.mov w15, w19

  /* 36: w16 = t2 <= t0 * Y3 = w14 * w12 */
  bn.mov w24, w14
  bn.mov w25, w12
  jal x1, mod_mul_256x256
  bn.mov w16, w19

  /* 37: w19 = Y3 <= X3 * Z3 = w11 * w13 */
  bn.mov w24, w11
  bn.mov w25, w13
  jal x1, mod_mul_256x256

  /* 38: w12 = Y3 <= Y3 + t2 = w19 + w16 */
  bn.addm w12, w19, w16

  /* 39: w19 = X3 <= t3 * X3 = w17 * w11 */
  bn.mov w24, w17
  bn.mov w25, w11
  jal x1, mod_mul_256x256

  /* 40: w11 = X3 <= X3 - t1 = w19 - w15 */
  bn.subm w11, w19, w15

  /* 41: w13 = Z3 <= t4 * Z3 = w18 * w13 */
  bn.mov w24, w18
  bn.mov w25, w13
  jal x1, mod_mul_256x256
  bn.mov w13, w19

  /* 42: w19 = t1 <= t3 * t0 = w17 * w14 */
  bn.mov w24, w17
  bn.mov w25, w14
  jal x1, mod_mul_256x256

  /* 43: w13 = Z3 <= Z3 + t1 = w13 + w19 */
  bn.addm w13, w13, w19

  ret
| |
| |
| /** |
| * Convert projective coordinates of a P-256 curve point to affine coordinates |
| * |
| * returns P = (x_a, y_a) = (x/z mod p, y/z mod p) |
| * with P being a valid P-256 curve point |
| * x_a and y_a being the affine coordinates of said curve point |
| * x, y and z being a set of projective coordinates of said point |
| * and p being the modulus of the P-256 underlying finite field. |
| * |
| * This routine computes the affine coordinates for a set of projective |
| * coordinates of a valid P-256 curve point. The routine performs the required |
| * divisions by computing the multiplicative modular inverse of the |
| * projective z-coordinate in the underlying finite field of the P-256 curve. |
| * For inverse computation Fermat's little theorem is used, i.e. |
| * we compute z^-1 = z^(p-2) mod p. |
| * For exponentiation a 27 step addition chain is used. |
 * This addition chain is (presumably?) the shortest addition chain known as
 * of today for the exponent exp = p - 2 = 2^256 - 2^224 + 2^192 + 2^96 - 3.
 * Origin of the chain can not fully be traced. [1] attributes it to a
 * specific patch to OpenSSL. The same addition chain is used in the OpenSSL
 * and BoringSSL crypto libraries.
| * This routine runs in constant time. |
| * |
| * [1] https://doi.org/10.1007/s13389-014-0090-x |
| * https://eprint.iacr.org/2013/816.pdf |
| * |
| * Flags: When leaving this subroutine, the M, L and Z flags of FG0 depend on |
| * the computed affine y-coordinate. |
| * |
| * @param[in] w8: x, x-coordinate of curve point (projective) |
| * @param[in] w9: y, y-coordinate of curve point (projective) |
| * @param[in] w10: z, z-coordinate of curve point (projective) |
| * @param[in] w29: p, modulus, 2^256 > p > 2^255. |
 * @param[in]  w28: u, pre-computed Barrett constant (without u[256]/MSb
 *                  of u, which is always 1 for the allowed range).
| * @param[in] MOD: p, modulus of the finite field of P-256 |
| * @param[out] w11: x_a, x-coordinate of curve point (affine) |
| * @param[out] w12: y_a, y-coordinate of curve point (affine) |
| * |
| * clobbered registers: w10 to w19, w24, w25 |
| * clobbered flag groups: FG0 |
| */ |
proj_to_affine:

  /* 1: exp = 0x1
     (bn.addm with zero also fully reduces z mod p: w10 <= z mod p) */
  bn.addm w10, w10, w31

  /* 2: exp = 0x2 = 2*0x1 */
  bn.mov w24, w10
  bn.mov w25, w10
  jal x1, mod_mul_256x256

  /* 3: exp = 0x3 = 0x2+0x1; keep z^0x3 in w12 for later chain steps */
  bn.mov w24, w19
  bn.mov w25, w10
  jal x1, mod_mul_256x256
  bn.mov w12, w19

  /* 4: exp = 0x6 = 2*0x3 */
  bn.mov w24, w19
  bn.mov w25, w19
  jal x1, mod_mul_256x256

  /* 5: exp = 0xc = 2*0x6 */
  bn.mov w24, w19
  bn.mov w25, w19
  jal x1, mod_mul_256x256

  /* 6: exp = 0xf = 0xc+0x3; keep z^0xf in w13 */
  bn.mov w24, w19
  bn.mov w25, w12
  jal x1, mod_mul_256x256
  bn.mov w13, w19

  /* 7: exp = 0xf0 = 16*0xf (4 squarings) */
  loopi 4, 4
    bn.mov w24, w19
    bn.mov w25, w19
    jal x1, mod_mul_256x256
    nop

  /* 8: exp = 0xff = 0xf0+0xf; keep z^0xff in w14 */
  bn.mov w24, w19
  bn.mov w25, w13
  jal x1, mod_mul_256x256
  bn.mov w14, w19

  /* 9: exp = 0xff00 = 256*0xff (8 squarings) */
  loopi 8, 4
    bn.mov w24, w19
    bn.mov w25, w19
    jal x1, mod_mul_256x256
    nop

  /* 10: exp = 0xffff = 0xff00+0xff; keep z^0xffff in w15 */
  bn.mov w24, w19
  bn.mov w25, w14
  jal x1, mod_mul_256x256
  bn.mov w15, w19

  /* 11: exp = 0xffff0000 = 2^16*0xffff (16 squarings) */
  loopi 16, 4
    bn.mov w24, w19
    bn.mov w25, w19
    jal x1, mod_mul_256x256
    nop

  /* 12: exp = 0xffffffff = 0xffff0000+0xffff; keep in w16 */
  bn.mov w24, w19
  bn.mov w25, w15
  jal x1, mod_mul_256x256
  bn.mov w16, w19

  /* 13: exp = 0xffffffff00000000 = 2^32*0xffffffff (32 squarings);
     keep in w17 */
  loopi 32, 4
    bn.mov w24, w19
    bn.mov w25, w19
    jal x1, mod_mul_256x256
    nop
  bn.mov w17, w19

  /* 14: exp = 0xffffffff00000001 = 0xffffffff00000000+0x1 */
  bn.mov w24, w10
  bn.mov w25, w19
  jal x1, mod_mul_256x256

  /* 15: exp =
     0xffffffff00000001000000000000000000000000000000000000000000000000
     = 2^192*0xffffffff00000001 (192 squarings); keep in w18 */
  loopi 192, 4
    bn.mov w24, w19
    bn.mov w25, w19
    jal x1, mod_mul_256x256
    nop
  bn.mov w18, w19

  /* 16: exp = 0xffffffffffffffff = 0xffffffff00000000+0xffffffff */
  bn.mov w24, w17
  bn.mov w25, w16
  jal x1, mod_mul_256x256

  /* 17: exp = 0xffffffffffffffff0000 = 2^16*0xffffffffffffffff */
  loopi 16, 4
    bn.mov w24, w19
    bn.mov w25, w19
    jal x1, mod_mul_256x256
    nop

  /* 18: exp = 0xffffffffffffffffffff = 0xffffffffffffffff0000+0xffff */
  bn.mov w24, w15
  bn.mov w25, w19
  jal x1, mod_mul_256x256

  /* 19: exp = 0xffffffffffffffffffff00 = 256*0xffffffffffffffffffff */
  loopi 8, 4
    bn.mov w24, w19
    bn.mov w25, w19
    jal x1, mod_mul_256x256
    nop

  /* 20: exp = 0xffffffffffffffffffffff = 0xffffffffffffffffffff00+0xff */
  bn.mov w24, w14
  bn.mov w25, w19
  jal x1, mod_mul_256x256

  /* 21: exp = 0xffffffffffffffffffffff0 = 16*0xffffffffffffffffffffff */
  loopi 4, 4
    bn.mov w24, w19
    bn.mov w25, w19
    jal x1, mod_mul_256x256
    nop

  /* 22: exp = 0xfffffffffffffffffffffff = 0xffffffffffffffffffffff0+0xf */
  bn.mov w24, w13
  bn.mov w25, w19
  jal x1, mod_mul_256x256

  /* 23: exp = 0x3ffffffffffffffffffffffc = 4*0xfffffffffffffffffffffff */
  loopi 2, 4
    bn.mov w24, w19
    bn.mov w25, w19
    jal x1, mod_mul_256x256
    nop

  /* 24: exp = 0x3fffffffffffffffffffffff = 0x3ffffffffffffffffffffffc+0x3 */
  bn.mov w24, w12
  bn.mov w25, w19
  jal x1, mod_mul_256x256

  /* 25: exp = 0xfffffffffffffffffffffffc = 4*0x3fffffffffffffffffffffff */
  loopi 2, 4
    bn.mov w24, w19
    bn.mov w25, w19
    jal x1, mod_mul_256x256
    nop

  /* 26: exp = 0xfffffffffffffffffffffffd = 0xfffffffffffffffffffffffc+0x1 */
  bn.mov w24, w10
  bn.mov w25, w19
  jal x1, mod_mul_256x256

  /* 27: exp = p-2
     = 0xffffffff00000001000000000000000000000000fffffffffffffffffffffffd
     = 0xfffffffffffffffffffffffd
       + 0xffffffff00000001000000000000000000000000000000000000000000000000
     w14 = z^exp = z^(p-2) = z^-1 mod p */
  bn.mov w24, w19
  bn.mov w25, w18
  jal x1, mod_mul_256x256
  bn.mov w14, w19

  /* convert x-coordinate to affine
       w11 = x_a = x/z = x * z^(-1) = w8 * w14 */
  bn.mov w24, w8
  bn.mov w25, w14
  jal x1, mod_mul_256x256
  bn.mov w11, w19

  /* convert y-coordinate to affine
       w12 = y_a = y/z = y * z^(-1) = w9 * w14 */
  bn.mov w24, w9
  bn.mov w25, w14
  jal x1, mod_mul_256x256
  bn.mov w12, w19

  ret
| |
| |
| /** |
| * Variable time modular multiplicative inverse computation |
| * |
| * returns x_inv = x^-1 mod m |
| * |
| * This routine computes the modular multiplicative inverse for any x < m in |
| * the field GF(m). |
| * For inverse computation Fermat's little theorem is used, i.e. |
| * we compute x^-1 = x^(m-2) mod m. |
| * For exponentiation we use a standard, variable time square and multiply |
| * algorithm. (The timing varies based on the modulus alone, not the input x, |
| * so it is safe to use for a non-secret modulus and a secret operand). |
| * |
| * @param[in] w0: x, a 256 bit operand with x < m |
| * @param[in] w29: m, modulus, 2^256 > m > 2^255. |
| * @param[in] w28: u, lower 256 bit of pre-computed Barrett constant |
| * @param[in] w31, all-zero |
| * @param[in] MOD: m, modulus |
| * @param[out] w1: x_inv, modular multiplicative inverse |
| * |
| * Flags: Flags have no meaning beyond the scope of this subroutine. |
| * |
 * clobbered registers: x2, w1, w2, w3, w19 to w25
| * clobbered flag groups: FG0 |
| */ |
mod_inv:

  /* subtract 2 from modulus for Fermat's little theorem
       w2 = MOD - 2 = m - 2 */
  bn.wsrr w2, 0
  bn.subi w2, w2, 2

  /* init square and multiply: w1 = 1 */
  bn.addi w1, w31, 1

  /* square and multiply loop, scanning the exponent MSB-first
     (the loop body must remain exactly 14 instructions for loopi) */
  loopi 256, 14

    /* square: w3 = w19 = w24*w25 = w1^2 mod m */
    bn.mov w24, w1
    bn.mov w25, w1
    jal x1, mod_mul_256x256
    bn.mov w3, w19

    /* shift the current exponent MSB into the FG0.C flag
         w2 = 2*w2 = w2 << 1 */
    bn.add w2, w2, w2

    /* commit the square-only result now: w1 <= C ? w1 : w3.
       If C is set, w1 is overwritten by the multiply below instead. */
    bn.sel w1, w1, w3, C
    /* read FG0.C (bit 0 of the flags CSR) and skip the multiply if clear */
    csrrs x2, 0x7c0, x0
    andi x2, x2, 1
    beq x2, x0, nomul

    /* multiply: w1 = w19 = w24*w25 = w3*w0 mod m */
    bn.mov w24, w3
    bn.mov w25, w0
    jal x1, mod_mul_256x256
    bn.mov w1, w19

nomul:
    nop

  ret
| |
| |
| /** |
| * Fetch curve point from dmem and randomize z-coordinate |
| * |
| * returns P = (x, y, z) = (x_a*z, y_a*z, z) |
| * with P being a valid P-256 curve point in projective coordinates |
| * x_a and y_a being the affine coordinates as fetched from dmem |
| * z being a randomized z-coordinate |
| * |
| * This routines fetches the affine x- and y-coordinates of a curve point from |
| * dmem and computes a valid set of projective coordinates. The z-coordinate is |
| * randomized and x and y are scaled appropriately. |
| * This routine runs in constant time. |
| * |
| * @param[in] x10: constant 24 |
| * @param[in] x21: dptr_x, pointer to dmem location containing affine |
| * x-coordinate of input point |
| * @param[in] x22: dptr_y, pointer to dmem location containing affine |
| * y-coordinate of input point |
| * @param[in] w28: u, lower 256 bit of Barrett constant for curve P-256 |
| * @param[in] w29: p, modulus of P-256 underlying finite field |
| * @param[in] w31: all-zero |
| * @param[in] MOD: p, modulus of P-256 underlying finite field |
| * @param[out] w14: x, projective x-coordinate |
| * @param[out] w15: y, projective y-coordinate |
| * @param[out] w16: z, random projective z-coordinate |
| * |
| * Flags: When leaving this subroutine, the M, L and Z flags of FG0 depend on |
| * the scaled projective y-coordinate. |
| * |
| * clobbered registers: w14 to w16, w19 to w26 |
| * clobbered flag groups: FG0 |
| */ |
fetch_proj_randomize:

  /* get a 256-bit random number from the URND WSR */
  bn.wsrr w16, 2 /* URND */

  /* reduce the random number mod p (a single conditional subtraction
     suffices since p > 2^255 implies w16 < 2p)
       w16 = z <= w16 mod p */
  bn.addm w16, w16, w31

  /* fetch x-coordinate from dmem; x10 holds the register index 24
       w24 = x_a <= dmem[x21] = dmem[dptr_x] */
  bn.lid x10, 0(x21)

  /* scale x-coordinate
       w14 = x <= w24*w16 = x_a*z mod p */
  bn.mov w25, w16
  jal x1, mod_mul_256x256
  bn.mov w14, w19

  /* fetch y-coordinate from dmem
       w24 = y_a <= dmem[x22] = dmem[dptr_y] */
  bn.lid x10, 0(x22)

  /* scale y-coordinate
       w15 = y <= w24*w16 = y_a*z mod p */
  bn.mov w25, w16
  jal x1, mod_mul_256x256
  bn.mov w15, w19

  ret
| |
| |
| /** |
| * P-256 point doubling in projective space |
| * |
| * returns R = (x_r, y_r, z_r) <= 2*P = 2*(x_p, y_p, z_p) |
| * with R, P being valid P-256 curve points |
| * |
| * This routines doubles a given P-256 curve point in projective coordinate. |
| * Internally this routine makes use of the point addition routine and |
| * adds the point to itself. |
| * This routine runs in constant time. |
| * |
| * @param[in] w8: x_p, x-coordinate of input point |
| * @param[in] w9: y_p, y-coordinate of input point |
| * @param[in] w10: z_p, z-coordinate of input point |
| * @param[in] w27: b, curve domain parameter |
 * @param[in]  w29: p, modulus of P-256 underlying finite field
 * @param[in]  w28: u, lower 256 bit of Barrett constant for curve P-256
| * @param[in] w31: all-zero. |
| * @param[in] MOD: p, modulus of P-256 underlying finite field |
| * @param[out] w11: x_r, x-coordinate of resulting point |
| * @param[out] w12: y_r, y-coordinate of resulting point |
| * @param[out] w13: z_r, z-coordinate of resulting point |
| * |
| * Flags: Flags have no meaning beyond the scope of this subroutine. |
| * |
| * clobbered registers: w11 to w25 |
| * clobbered flag groups: FG0 |
| */ |
proj_double:

  /* Copy P into the second-operand registers of proj_add:
     Q = (x_q, y_q, z_q) = (w11, w12, w13) <= P = (x_p, y_p, z_p)
     = (w8, w9, w10) */
  bn.mov    w11, w8
  bn.mov    w12, w9
  bn.mov    w13, w10

  /* Add the point to itself:
     R = (x_r, y_r, z_r) = (w11, w12, w13) = P+Q
     = (w8, w9, w10) + (w11, w12, w13) = (x_p, y_p, z_p) + (x_q, y_q, z_q) */
  jal       x1, proj_add

  ret
| |
| |
| /** |
| * P-256 scalar point multiplication in affine space |
| * |
| * returns R = k*P = k*(x_p, y_p) |
| * with R, P being valid P-256 curve points in affine coordinates |
| * k being a 256 bit scalar |
| * |
| * This routine performs scalar multiplication based on the group laws |
| * of Weierstrass curves. |
| * A constant time double-and-add algorithm (sometimes referred to as |
| * double-and-add-always) is used. |
| * Due to the P-256 optimized implementations of the called routines for |
| * point addition and doubling, this routine is limited to P-256. |
| * |
| * The routine receives the scalar in two shares k0, k1 such that |
| * k = (k0 + k1) mod n |
| * The loop operates on both shares in parallel, computing (k0 + k1) * P as |
| * follows: |
| * Q = (0, 1, 0) # origin |
| * for i in 319..0: |
| * Q = 2 * Q |
| * A = if (k0[i] ^ k1[i]) then P else 2P |
| * B = Q + A |
| * Q = if (k0[i] | k1[i]) then B else Q |
| * |
| * |
| * Each share k0/k1 is 320 bits, even though it represents a 256-bit value. |
| * This is a side-channel protection measure. |
| * |
| * @param[in] x21: dptr_x, pointer to affine x-coordinate in dmem |
| * @param[in] x22: dptr_y, pointer to affine y-coordinate in dmem |
| * @param[in] w0: lower 256 bits of k0, first share of scalar |
| * @param[in] w1: upper 64 bits of k0, first share of scalar |
| * @param[in] w2: lower 256 bits of k1, second share of scalar |
| * @param[in] w3: upper 64 bits of k1, second share of scalar |
| * @param[in] w27: b, curve domain parameter |
| * @param[in] w31: all-zero |
| * @param[in] MOD: p, modulus, 2^256 > p > 2^255. |
| * @param[out] w11: x_r, affine x-coordinate of resulting point |
| * @param[out] w12: y_r, affine y-coordinate of resulting point |
| * |
| * Flags: When leaving this subroutine, the M, L and Z flags of FG0 depend on |
| * the computed affine y-coordinate. |
| * |
| * clobbered registers: x2, x3, x10, w0 to w30 |
| * clobbered flag groups: FG0 |
| */ |
scalar_mult_int:

  /* load field modulus p from dmem
     w29 <= p = dmem[p256_p] */
  li        x2, 29
  la        x3, p256_p
  bn.lid    x2, 0(x3)

  /* store modulus to MOD WSR */
  bn.wsrw   0, w29

  /* load lower 256 bit of Barrett constant u for modulus p from dmem
     w28 <= u = dmem[p256_u_p] */
  li        x2, 28
  la        x3, p256_u_p
  bn.lid    x2, 0(x3)

  /* load domain parameter b from dmem
     w27 <= b = dmem[p256_b] */
  li        x2, 27
  la        x3, p256_b
  bn.lid    x2, 0(x3)

  /* get randomized projective coordinates of curve point
     P = (x_p, y_p, z_p) = (w8, w9, w10) = (w14, w15, w16) =
     (x*z mod p, y*z mod p, z) */
  li        x10, 24
  jal       x1, fetch_proj_randomize
  bn.mov    w8, w14
  bn.mov    w9, w15
  bn.mov    w10, w16

  /* Init 2P, this will be used for the addition part in the double-and-add
     loop when the bit at the current index is 1 for both shares of the scalar.
     2P = (w4, w5, w6) <= (w11, w12, w13) <= 2*(w8, w9, w10) = 2*P */
  jal       x1, proj_double
  bn.mov    w4, w11
  bn.mov    w5, w12
  bn.mov    w6, w13

  /* init double-and-add with point in infinity
     Q = (w8, w9, w10) <= (0, 1, 0) */
  bn.mov    w8, w31
  bn.addi   w9, w31, 1
  bn.mov    w10, w31

  /* Shift shares of k so their MSBs are in the most significant position of a
     word. Each share is 320 bits held as [w1,w0] resp. [w3,w2]; shifting the
     512-bit pair left by 192 bits places share bit 319 at w1[255]/w3[255].
     w0,w1 <= [w0, w1] << 192 = k0 << 192
     w2,w3 <= [w2, w3] << 192 = k1 << 192 */
  bn.rshi   w1, w1, w0 >> 64
  bn.rshi   w0, w0, w31 >> 64
  bn.rshi   w3, w3, w2 >> 64
  bn.rshi   w2, w2, w31 >> 64

  /* Double-and-add loop with decreasing index. NOTE: the loop body must
     remain exactly 34 instructions long for this loopi count. */
  loopi     320, 34

    /* double point Q
       Q = (w11, w12, w13) <= 2*(w8, w9, w10) = 2*Q */
    jal       x1, proj_double

    /* re-fetch and randomize P again (fresh z blinds each iteration)
       P = (w14, w15, w16) */
    jal       x1, fetch_proj_randomize

    /* probe if MSb of either of the two scalars (k0 or k1) but not both is 1.
       - If only one MSb is set, select P for addition
       - If both MSbs are set, select 2P for addition
       - If neither MSB is set, also 2P will be selected but this will be
         discarded later */
    bn.xor    w20, w1, w3

    /* N.B. The M bit here is secret. For side channel protection in the
       selects below, it is vital that neither option is equal to the
       destination register (e.g. bn.sel w0, w0, w1). In this case, the
       hamming distance from the destination's previous value to its new value
       will be 0 in one of the cases and potentially reveal M.

       P = (w8, w9, w10)
       <= (w1[255] xor w3[255])?P=(w14, w15, w16):2P=(w4, w5, w6) */
    bn.sel    w8, w14, w4, M
    bn.sel    w9, w15, w5, M
    bn.sel    w10, w16, w6, M

    /* save doubling result to survive follow-up subroutine call
       Q = (w7, w26, w30) <= (w11, w12, w13) */
    bn.mov    w7, w11
    bn.mov    w26, w12
    bn.mov    w30, w13

    /* add points
       Q+P = (w11, w12, w13) <= (w11, w12, w13) + (w8, w9, w10) */
    jal       x1, proj_add

    /* probe if MSb of either one or both of the two
       scalars (k0 or k1) is 1.*/
    bn.or     w20, w1, w3

    /* N.B. As before, the select instructions below must use distinct
       source/destination registers to avoid revealing M.

       Select doubling result (Q) or addition result (Q+P)
       Q = (w1[255] or w3[255])?Q_a=(w11, w12, w13):Q=(w7, w26, w30) */
    bn.sel    w8, w11, w7, M
    bn.sel    w9, w12, w26, M
    bn.sel    w10, w13, w30, M

    /* Shift both scalars left 1 bit to expose the next bit at the MSB. */
    bn.rshi   w1, w1, w0 >> 255
    bn.rshi   w0, w0, w31 >> 255
    bn.rshi   w3, w3, w2 >> 255
    bn.rshi   w2, w2, w31 >> 255

    /* init regs with random numbers from URND */
    bn.wsrr   w11, 2
    bn.wsrr   w12, 2
    bn.wsrr   w13, 2

    /* get a fresh random number from URND and scale the coordinates of
       2P = (w4, w5, w6) (scaling each projective coordinate with same
       factor results in same point) */
    bn.wsrr   w7, 2

    /* w4 = w4 * w7 */
    bn.mov    w24, w4
    bn.mov    w25, w7
    jal       x1, mod_mul_256x256
    bn.mov    w4, w19

    /* w5 = w5 * w7 */
    bn.mov    w24, w5
    bn.mov    w25, w7
    jal       x1, mod_mul_256x256
    bn.mov    w5, w19

    /* w6 = w6 * w7 */
    bn.mov    w24, w6
    bn.mov    w25, w7
    jal       x1, mod_mul_256x256
    bn.mov    w6, w19

  /* Check if the z-coordinate of Q is 0. If so, fail; this represents the
     point at infinity and means the scalar was zero mod n, which likely
     indicates a fault attack.

     FG0.Z <= if (w10 == 0) then 1 else 0 */
  bn.cmp    w10, w31
  jal       x1, trigger_fault_if_fg0_z

  /* convert back to affine coordinates
     R = (x_a, y_a) = (w11, w12) */
  jal       x1, proj_to_affine

  ret
| |
| |
| /** |
| * P-256 ECDSA signature generation |
| * |
| * returns the signature as the pair r, s with |
| * r = x_1 mod n |
| * and s = k^(-1)(msg + r*d) mod n |
| * with x_1 being the affine x-coordinate of the curve point k*G, |
| * where G is the curve's base point. |
| * k being a supplied secret random number, |
| * n being the order of the base point G of P-256, |
| * msg being the msg to be signed, |
| * d being the private key. |
| * |
| * This routine runs in constant time. |
| * |
| * Note: Some versions of the ECDSA spec suggest that msg must be reduced |
| * modulo n (e.g. RFC 6979, section 2.4). However, for this implementation, it |
| * is sufficient that msg < 2^256, because the message is multiplied with |
| * k^(-1) mod n, and our Barrett multiplication implementation accepts any |
| * operands a and b such that a * b < 2^256 * p and fully reduces the result. |
| * |
| * This routine assumes that the secret scalars d and k are provided in two |
| * shares each (d0/d1 and k0/k1 respectively), where |
| * d = (d0 + d1) mod n |
| * k = (k0 + k1) mod n |
| * |
| * Each share is 320 bits, which gives us 64 bits of extra redundancy modulo n |
| * (256 bits). This is a protection measure against side-channel attacks. |
| * |
| * For s = k^-1 * (r * d + msg), we compute a random nonzero masking scalar |
| * alpha, and compute s as: |
| * s = ((k * alpha)^-1 * (r * (d * alpha) + alpha * msg)) mod n |
| * |
 * We choose alpha to be at most 128 bits, so the product with a 320-bit share
 * fits in the same 512-bit modular reduction routine that we use for
 * 256x256-bit multiplications. It should be safe to compute e.g. k * alpha =
| * (k0 * alpha + k1 * alpha) mod n, because alpha has enough randomness to mask |
| * the true value of k. |
| * |
| * @param[in] dmem[k0]: first share of secret scalar (320 bits) |
| * @param[in] dmem[k1]: second share of secret scalar (320 bits) |
| * @param[in] dmem[msg]: message to be signed (256 bits) |
| * @param[in] dmem[r]: dmem buffer for r component of signature (256 bits) |
| * @param[in] dmem[s]: dmem buffer for s component of signature (256 bits) |
| * @param[in] dmem[d0]: first share of private key d (320 bits) |
| * @param[in] dmem[d1]: second share of private key d (320 bits) |
| * |
| * Flags: When leaving this subroutine, the M, L and Z flags of FG0 depend on |
| * the computed affine y-coordinate. |
| * |
| * clobbered registers: x2, x3, x16 to x23, w0 to w26 |
| * clobbered flag groups: FG0 |
| */ |
p256_sign:

  /* init all-zero register */
  bn.xor    w31, w31, w31

  /* load first share of secret scalar k from dmem: w0,w1 = dmem[k0] */
  la        x16, k0
  li        x2, 0
  bn.lid    x2, 0(x16++)
  li        x2, 1
  bn.lid    x2, 0(x16)

  /* load second share of secret scalar k from dmem: w2,w3 = dmem[k1] */
  la        x16, k1
  li        x2, 2
  bn.lid    x2, 0(x16++)
  li        x2, 3
  bn.lid    x2, 0(x16)

  /* setup modulus n (curve order) and Barrett constant
     MOD <= w29 <= n = dmem[p256_n]; w28 <= u_n = dmem[p256_u_n] */
  li        x2, 29
  la        x3, p256_n
  bn.lid    x2, 0(x3)
  bn.wsrw   0, w29
  li        x2, 28
  la        x3, p256_u_n
  bn.lid    x2, 0(x3)

  /* scalar multiplication with base point
     (x_1, y_1) = (w11, w12) <= k*G = k*(dmem[p256_gx], dmem[p256_gy]) */
  la        x21, p256_gx
  la        x22, p256_gy
  jal       x1, scalar_mult_int

  /* Setup modulus n (curve order) and Barrett constant again;
     scalar_mult_int switched MOD to the field modulus p.
     MOD <= w29 <= n = dmem[p256_n]; w28 <= u_n = dmem[p256_u_n] */
  li        x2, 29
  la        x3, p256_n
  bn.lid    x2, 0(x3)
  bn.wsrw   0, w29
  li        x2, 28
  la        x3, p256_u_n
  bn.lid    x2, 0(x3)

  /* re-load first share of secret scalar k from dmem (w0..w3 were
     clobbered by scalar_mult_int): w0,w1 = dmem[k0] */
  la        x16, k0
  li        x2, 0
  bn.lid    x2, 0(x16++)
  li        x2, 1
  bn.lid    x2, 0(x16)

  /* re-load second share of secret scalar k from dmem: w2,w3 = dmem[k1] */
  la        x16, k1
  li        x2, 2
  bn.lid    x2, 0(x16++)
  li        x2, 3
  bn.lid    x2, 0(x16)

  /* Generate a random 127-bit number.
     w4 <= URND()[255:129] */
  bn.wsrr   w4, 0x2 /* URND */
  bn.rshi   w4, w31, w4 >> 129

  /* Add 1 to get a 128-bit nonzero scalar for masking.
     w4 <= w4 + 1 = alpha */
  bn.addi   w4, w4, 1

  /* w0 <= ([w0,w1] * w4) mod n = (k0 * alpha) mod n */
  bn.mov    w24, w0
  bn.mov    w25, w1
  bn.mov    w26, w4
  jal       x1, mod_mul_320x128
  bn.mov    w0, w19

  /* w19 <= ([w2,w3] * w26) mod n = (k1 * alpha) mod n
     (w26 still holds alpha from the multiplication above) */
  bn.mov    w24, w2
  bn.mov    w25, w3
  jal       x1, mod_mul_320x128

  /* w0 <= (w0+w19) mod n = (k * alpha) mod n */
  bn.addm   w0, w0, w19

  /* w1 <= w0^-1 mod n = (k * alpha)^-1 mod n */
  jal       x1, mod_inv

  /* Load first share of secret key d from dmem.
     w2,w3 = dmem[d0] */
  la        x16, d0
  li        x2, 2
  bn.lid    x2, 0(x16++)
  li        x2, 3
  bn.lid    x2, 0(x16)

  /* Load second share of secret key d from dmem.
     w5,w6 = dmem[d1] */
  la        x16, d1
  li        x2, 5
  bn.lid    x2, 0(x16++)
  li        x2, 6
  bn.lid    x2, 0(x16)

  /* w0 <= ([w2,w3] * w4) mod n = (d0 * alpha) mod n */
  bn.mov    w24, w2
  bn.mov    w25, w3
  bn.mov    w26, w4
  jal       x1, mod_mul_320x128
  bn.mov    w0, w19

  /* w19 <= ([w5,w6] * w4) mod n = (d1 * alpha) mod n */
  bn.mov    w24, w5
  bn.mov    w25, w6
  bn.mov    w26, w4
  jal       x1, mod_mul_320x128

  /* w0 <= (w0+w19) mod n = (d * alpha) mod n */
  bn.addm   w0, w0, w19

  /* Compare to 0.
     FG0.Z <= (w0 =? w31) = ((d * alpha) mod n =? 0) */
  bn.cmp    w0, w31

  /* Trigger a fault if FG0.Z is set, aborting the computation.

     Since alpha is nonzero mod n, (d * alpha) mod n = 0 means d is zero mod n,
     which violates ECDSA private key requirements. This could technically be
     triggered by an unlucky key manager seed, but the probability is so low
     (~1/n) that it more likely indicates a fault attack. */
  jal       x1, trigger_fault_if_fg0_z

  /* w24 = r <= w11 mod n */
  bn.addm   w24, w11, w31

  /* Store r of signature in dmem.
     dmem[r] <= r = w24 */
  la        x19, r
  li        x2, 24
  bn.sid    x2, 0(x19)

  /* w19 <= (w24 * w0) mod n = (r * d * alpha) mod n */
  bn.mov    w25, w0
  jal       x1, mod_mul_256x256

  /* w0 <= (w1 * w19) mod n = ((k * alpha)^-1 * (r * d * alpha)) mod n
     = (k^-1 * r * d) mod n */
  bn.mov    w24, w1
  bn.mov    w25, w19
  jal       x1, mod_mul_256x256
  bn.mov    w0, w19

  /* Load message from dmem:
     w24 = msg <= dmem[msg] */
  la        x18, msg
  li        x2, 24
  bn.lid    x2, 0(x18)

  /* w19 <= (w24 * w4) mod n = (msg * alpha) mod n */
  bn.mov    w25, w4
  jal       x1, mod_mul_256x256

  /* w19 <= (w1 * w19) mod n = ((k * alpha)^-1 * (msg * alpha)) mod n
     = (k^-1 * msg) mod n */
  bn.mov    w24, w1
  bn.mov    w25, w19
  jal       x1, mod_mul_256x256

  /* w0 <= (w0 + w19) mod n = (k^-1*r*d + k^-1*msg) mod n = s */
  bn.addm   w0, w0, w19

  /* Store s of signature in dmem.
     dmem[s] <= s = w0 */
  la        x20, s
  li        x2, 0
  bn.sid    x2, 0(x20)

  ret
| |
| |
| /** |
| * P-256 scalar multiplication with base point G |
| * |
| * returns R = d*G = d*(x_g, y_g) |
| * with R and G being valid P-256 curve points in affine coordinates, |
| * furthermore G being the curves base point, |
| * d being a 256 bit scalar |
| * |
| * Performs a scalar multiplication of a scalar with the base point G of curve |
| * P-256. |
| * |
| * This routine assumes that the scalar d is provided in two shares, d0 and d1, |
| * where: |
| * d = (d0 + d1) mod n |
| * |
| * The shares d0 and d1 are up to 320 bits each to provide extra redundancy. |
| * |
| * This routine runs in constant time. |
| * |
| * @param[in] dmem[d0]: first share of scalar d (320 bits) |
| * @param[in] dmem[d1]: second share of scalar d (320 bits) |
| * @param[out] dmem[x]: affine x-coordinate (256 bits) |
| * @param[out] dmem[y]: affine y-coordinate (256 bits) |
| * |
| * Flags: Flags have no meaning beyond the scope of this subroutine. |
| * |
| * clobbered registers: x2, x3, x16, x17, x21, x22, w0 to w26 |
| * clobbered flag groups: FG0 |
| */ |
p256_base_mult:

  /* init all-zero register */
  bn.xor    w31, w31, w31

  /* Load first share of secret key d from dmem.
     w0,w1 = dmem[d0] */
  la        x16, d0
  li        x2, 0
  bn.lid    x2, 0(x16++)
  li        x2, 1
  bn.lid    x2, 0(x16)

  /* Load second share of secret key d from dmem.
     w2,w3 = dmem[d1] */
  la        x16, d1
  li        x2, 2
  bn.lid    x2, 0(x16++)
  li        x2, 3
  bn.lid    x2, 0(x16)

  /* call internal scalar multiplication routine with the base point G
     R = (x_a, y_a) = (w11, w12) <= d*G with d = (d0 + d1) mod n,
     d0 in [w1,w0], d1 in [w3,w2] */
  la        x21, p256_gx
  la        x22, p256_gy
  jal       x1, scalar_mult_int

  /* store result (affine coordinates) in dmem
     dmem[x] <= x_a = w11
     dmem[y] <= y_a = w12 */
  li        x2, 11
  la        x21, x
  bn.sid    x2++, 0(x21)
  la        x22, y
  bn.sid    x2, 0(x22)

  ret
| |
| |
| /** |
| * Variable time modular multiplicative inverse computation |
| * |
| * Returns c <= a^(-1) mod m |
| * with a being a bigint of length 256 bit with a < m |
| * m being the modulus with a length of 256 bit |
| * c being a 256-bit result |
| * |
| * This routine implements the computation of the modular multiplicative |
| * inverse based on the binary GCD or Stein's algorithm. |
| * The implemented variant is based on the |
| * "right-shift binary extended GCD" as it is described in section 3.1 of [1] |
| * (Algorithm 1). |
| * [1] https://doi.org/10.1155/ES/2006/32192 |
| * |
| * Note that this is a variable time implementation. I.e. this routine will |
| * show a data dependent timing and execution profile. Only use in situations |
| * where a full white-box environment is acceptable. |
| * |
| * Flags: Flags have no meaning beyond the scope of this subroutine. |
| * |
| * @param[in] w0: a, operand |
| * @param[in] MOD: m, modulus |
| * @param[in] w31: all-zero |
| * @param[out] w1: result c |
| * |
| * clobbered registers: x2, w2, w3, w4, w7 |
| * clobbered flag groups: FG0 |
| */ |
mod_inv_var:

  /* w2 = r = 0 */
  bn.mov    w2, w31

  /* w3 = s = 1 */
  bn.addi   w3, w31, 1

  /* w4 = u = MOD; keep a second copy of the modulus in w7 for the
     halving steps below */
  bn.wsrr   w4, 0
  bn.wsrr   w7, 0

  /* w5 = v = w0 */
  bn.mov    w5, w0

ebgcd_loop:
  /* test if u is odd: bn.or copies u's LSB to FG0.L (CSR 0x7c0 bit 2) */
  bn.or     w4, w4, w4
  csrrs     x2, 0x7c0, x0
  andi      x2, x2, 4
  bne       x2, x0, ebgcd_u_odd

  /* u is even: */
  /* w4 = u <= u/2 = w4 >> 1 */
  bn.rshi   w4, w31, w4 >> 1

  /* test if r is odd (FG0.L as above) */
  bn.or     w2, w2, w2
  csrrs     x2, 0x7c0, x0
  andi      x2, x2, 4
  bne       x2, x0, ebgcd_r_odd

  /* r is even: */
  /* w2 = r <= r/2 = w2 >> 1 */
  bn.rshi   w2, w31, w2 >> 1
  jal       x0, ebgcd_loop

ebgcd_r_odd:
  /* w2 = r <= (r + m)/2 = (w2 + w7) >> 1; the carry captured in w6
     preserves bit 256 of the sum across the shift */
  bn.add    w2, w7, w2
  bn.addc   w6, w31, w31
  bn.rshi   w2, w6, w2 >> 1
  jal       x0, ebgcd_loop

ebgcd_u_odd:
  /* test if v is odd (FG0.L) */
  bn.or     w5, w5, w5
  csrrs     x2, 0x7c0, x0
  andi      x2, x2, 4
  bne       x2, x0, ebgcd_uv_odd

  /* v is even: */
  /* w5 = v <= v/2 = w5 >> 1 */
  bn.rshi   w5, w31, w5 >> 1

  /* test if s is odd (FG0.L) */
  bn.or     w3, w3, w3
  csrrs     x2, 0x7c0, x0
  andi      x2, x2, 4
  bne       x2, x0, ebgcd_s_odd

  /* s is even: */
  /* w3 = s <= s/2 = w3 >> 1 */
  bn.rshi   w3, w31, w3 >> 1
  jal       x0, ebgcd_loop

ebgcd_s_odd:
  /* w3 = s <= (s + m)/2 = (w3 + w7) >> 1 */
  bn.add    w3, w7, w3
  bn.addc   w6, w31, w31
  bn.rshi   w3, w6, w3 >> 1
  jal       x0, ebgcd_loop

ebgcd_uv_odd:
  /* test if v >= u: the compare sets FG0.C (CSR bit 0) iff v - u borrows,
     i.e. iff v < u */
  bn.cmp    w5, w4
  csrrs     x2, 0x7c0, x0
  andi      x2, x2, 1
  beq       x2, x0, ebgcd_v_gte_u

  /* u > v: */
  /* w2 = r <= r - s = w2 - w3; if (r < 0): r <= r + m */
  bn.subm   w2, w2, w3

  /* w4 = u <= u - v = w4 - w5 */
  bn.sub    w4, w4, w5
  jal       x0, ebgcd_loop

ebgcd_v_gte_u:
  /* w3 = s <= s - r = w3 - w2; if (s < 0) s <= s + m */
  bn.subm   w3, w3, w2

  /* w5 = v <= v - u = w5 - w4 */
  bn.sub    w5, w5, w4

  /* if v > 0 go back to start of loop: the subtraction above set FG0.Z
     (CSR bit 3) iff v == 0 */
  csrrs     x2, 0x7c0, x0
  andi      x2, x2, 8
  beq       x2, x0, ebgcd_loop

  /* v <= 0: */
  /* if (r > m): w1 = a = r - m = w2 - MOD else: w1 = a = r = w2 */
  bn.addm   w1, w2, w31

  ret
| |
| |
| /** |
| * P-256 ECDSA signature verification |
| * |
| * returns the affine x-coordinate of |
| * (x1, y1) = u1*G + u2*Q |
| * with u1 = z*s^-1 mod n and u2 = r*s^-1 mod n |
| * with G being the curve's base point, |
| * z being the message |
| * r, s being the signature |
| * Q being the public key. |
| * |
| * The routine computes the x1 coordinate and places it in dmem. x1 will be |
| * reduced (mod n), however, the final comparison has to be performed on the |
| * host side. The signature is valid if x1 == r. |
| * This routine runs in variable time. |
| * |
| * @param[in] dmem[msg]: message to be verified (256 bits) |
| * @param[in] dmem[r]: r component of signature (256 bits) |
| * @param[in] dmem[s]: s component of signature (256 bits) |
| * @param[in] dmem[x]: affine x-coordinate of public key (256 bits) |
| * @param[in] dmem[y]: affine y-coordinate of public key (256 bits) |
| * @param[out] dmem[x_r]: dmem buffer for reduced affine x_r-coordinate (x_1) |
| * |
| * Flags: Flags have no meaning beyond the scope of this subroutine. |
| * |
| * clobbered registers: x2, x3, x13, x14, x17 to x24, w0 to w25 |
| * clobbered flag groups: FG0 |
| */ |
p256_verify:

  /* init all-zero register */
  bn.xor    w31, w31, w31

  /* load domain parameter b from dmem
     w27 <= b = dmem[p256_b] */
  li        x2, 27
  la        x3, p256_b
  bn.lid    x2, 0(x3)

  /* load r of signature from dmem into w11 (x2 = 11).
     NOTE(review): the original comment claimed w24, but x2 = 11 selects w11,
     and w11 is overwritten (public-key load below) before it is ever read,
     so this load appears to be dead — confirm whether w24 was intended. */
  la        x19, r
  li        x2, 11
  bn.lid    x2, 0(x19)

  /* setup modulus n (curve order) and Barrett constant
     MOD <= w29 <= n = dmem[p256_n]; w28 <= u_n = dmem[p256_u_n] */
  li        x2, 29
  la        x3, p256_n
  bn.lid    x2, 0(x3)
  bn.wsrw   0, w29
  li        x2, 28
  la        x3, p256_u_n
  bn.lid    x2, 0(x3)

  /* load s of signature from dmem: w0 = s = dmem[s] */
  la        x20, s
  bn.lid    x0, 0(x20)

  /* goto 'fail' if w0 == w31 <=> s == 0
     (FG0.Z, CSR bit 3, is set by the compare iff s == 0) */
  bn.cmp    w0, w31
  csrrs     x2, 0x7c0, x0
  andi      x2, x2, 8
  bne       x2, x0, fail

  /* goto 'fail' if w0 >= w29 <=> s >= n
     (FG0.C, CSR bit 0, is set iff s - n borrows, i.e. s < n) */
  bn.cmp    w0, w29
  csrrs     x2, 0x7c0, x0
  andi      x2, x2, 1
  beq       x2, x0, fail

  /* w1 = s^-1 mod n */
  jal       x1, mod_inv_var

  /* load r of signature from dmem: w24 = r = dmem[r] */
  la        x19, r
  li        x2, 24
  bn.lid    x2, 0(x19)

  /* goto 'fail' if w24 == w31 <=> r == 0 */
  bn.cmp    w24, w31
  csrrs     x2, 0x7c0, x0
  andi      x2, x2, 8
  bne       x2, x0, fail

  /* goto 'fail' if w24 >= w29 <=> r >= n */
  bn.cmp    w24, w29
  csrrs     x2, 0x7c0, x0
  andi      x2, x2, 1
  beq       x2, x0, fail

  /* w25 = s^-1 = w1 */
  bn.mov    w25, w1

  /* u2 = w0 = w19 <= w24*w25 = r*s^-1 mod n */
  jal       x1, mod_mul_256x256
  bn.mov    w0, w19

  /* load message, w24 = msg = dmem[msg] */
  la        x18, msg
  li        x2, 24
  bn.lid    x2, 0(x18)

  /* u1 = w1 = w19 <= w24*w25 = w24*w1 = msg*s^-1 mod n */
  bn.mov    w25, w1
  jal       x1, mod_mul_256x256
  bn.mov    w1, w19

  /* setup modulus p (field modulus) and Barrett constant for the point
     arithmetic that follows */
  li        x2, 29
  la        x3, p256_p
  bn.lid    x2, 0(x3)
  bn.wsrw   0, w29
  li        x2, 28
  la        x3, p256_u_p
  bn.lid    x2, 0(x3)

  /* load public key Q from dmem and use in projective form (set z to 1)
     Q = (w11, w12, w13) = (dmem[x], dmem[y], 1) */
  li        x2, 11
  la        x21, x
  bn.lid    x2++, 0(x21)
  la        x22, y
  bn.lid    x2, 0(x22)
  bn.addi   w13, w31, 1

  /* load base point G and use in projective form (set z to 1)
     G = (w8, w9, w10) = (x_g, y_g, 1) */
  li        x13, 8
  la        x23, p256_gx
  bn.lid    x13, 0(x23)
  li        x14, 9
  la        x24, p256_gy
  bn.lid    x14, 0(x24)
  bn.addi   w10, w31, 1

  /* The rest of the routine implements a variable time double-and-add
     algorithm. For the signature verification we need to compute the point
     C = (x1, y1) = u_1*G + u_2*Q. This can be done in a single
     double-and-add routine by using Shamir's Trick. */

  /* G+Q = (w3,w4,w5) <= (w11,w12,w13) = (w8,w9,w10) (+) (w11,w12,w13) */
  jal       x1, proj_add
  bn.mov    w3, w11
  bn.mov    w4, w12
  bn.mov    w5, w13

  /* w2 = u_2 & u_1 = w0 & w1; bit i of w2 indicates that both scalars
     have bit i set */
  bn.and    w2, w0, w1

  /* init double and add algorithm with (0, 1, 0), the neutral element in
     projective coordinates */
  bn.mov    w11, w31
  bn.addi   w12, w31, 1
  bn.mov    w13, w31

  /* Main loop with decreasing index i (i=255 downto 0). NOTE: the loop body
     must remain exactly 31 instructions long for this loopi count. */
  loopi     256, 31

    /* always double: C = (w11,w12,w13) <= 2 (*) C = 2 (*) (w11,w12,w13) */
    bn.mov    w8, w11
    bn.mov    w9, w12
    bn.mov    w10, w13
    jal       x1, proj_add

    /* if either u_1[i] == 0 or u_2[i] == 0 jump to 'no_both';
       the shift-out of w2's MSB lands in FG0.C (CSR bit 0) */
    bn.add    w2, w2, w2
    csrrs     x2, 0x7c0, x0
    andi      x2, x2, 1
    beq       x2, x0, no_both

    /* both bits at current index (u1[i] and u2[i]) are set:
       do C <= C + (G + Q) using the precomputed sum and jump to end */
    bn.mov    w8, w3
    bn.mov    w9, w4
    bn.mov    w10, w5
    jal       x1, proj_add
    jal       x0, no_q

    /* either u1[i] or u2[i] is set, but not both */
no_both:

    /* if u2[i] is not set jump to 'no_g' */
    bn.add    w6, w0, w0
    csrrs     x2, 0x7c0, x0
    andi      x2, x2, 1
    beq       x2, x0, no_g

    /* u2[i] is set: do C <= C + Q
       (x13/x14 still select w8/w9; x21/x22 still point to dmem[x]/dmem[y]) */
    bn.lid    x13, 0(x21)
    bn.lid    x14, 0(x22)
    bn.addi   w10, w31, 1
    jal       x1, proj_add

no_g:
    /* if u1[i] is not set jump to 'no_q' */
    bn.add    w6, w1, w1
    csrrs     x2, 0x7c0, x0
    andi      x2, x2, 1
    beq       x2, x0, no_q

    /* load base point coordinates
       w8 <= g_x = dmem [p256_gx]; w9 <= g_y = dmem[p256_gy] */
    bn.lid    x13, 0(x23)
    bn.lid    x14, 0(x24)

    /* u1[i] is set: do C <= C + G */
    bn.addi   w10, w31, 1
    jal       x1, proj_add

no_q:
    /* left shift w0 and w1 to move the next bit into MSB position */
    bn.add    w0, w0, w0
    bn.add    w1, w1, w1

  /* compute inverse of z-coordinate: w1 = z_c^-1 mod p */
  bn.mov    w0, w13
  jal       x1, mod_inv_var

  /* convert x-coordinate of C back to affine: x1 = x_c * z_c^-1 mod p */
  bn.mov    w24, w1
  bn.mov    w25, w11
  jal       x1, mod_mul_256x256

  /* final reduction: w24 = x1 <= x1 mod n
     NOTE(review): bn.subm only adds MOD back when the subtraction borrows;
     with a zero subtrahend it performs no reduction, so a w19 in [n, p)
     would be left unreduced here. Confirm whether bn.addm (conditional
     subtraction of n) was intended. */
  la        x3, p256_n
  bn.lid    x0, 0(x3)
  bn.wsrw   0, w0
  bn.subm   w24, w19, w31

fail:
  /* store affine x-coordinate in dmem: dmem[x_r] = w24 = x_r
     (on the early 'fail' paths w24 holds a stale value, so the host-side
     comparison x_r == r should not match) */
  la        x17, x_r
  li        x2, 24
  bn.sid    x2, 0(x17)

  ret
| |
| |
| /** |
| * Externally callable wrapper for P-256 scalar point multiplication |
| * |
 * returns R = k*P = k*(x_p, y_p)
 * with R, P being valid P-256 curve points in affine coordinates,
 * k being a 256 bit scalar.
| * |
| * This routine assumes that the scalar k is provided in two shares, k0 and k1, |
| * where: |
| * k = (k0 + k1) mod n |
| * |
| * Sets up context and calls internal scalar multiplication routine. |
| * This routine runs in constant time. |
| * |
 * @param[in]  dmem[k0]: first share of scalar k (320 bits)
 * @param[in]  dmem[k1]: second share of scalar k (320 bits)
| * @param[in,out] dmem[x]: affine x-coordinate in dmem |
| * @param[in,out] dmem[y]: affine y-coordinate in dmem |
| * |
| * Flags: When leaving this subroutine, the M, L and Z flags of FG0 depend on |
| * the computed affine y-coordinate. |
| * |
| * clobbered registers: x2, x3, x16, x17, x21, x22, w0 to w25 |
| * clobbered flag groups: FG0 |
| */ |
p256_scalar_mult:

  /* init all-zero register */
  bn.xor    w31, w31, w31

  /* Load first share of scalar k from dmem. Each share is a 320-bit value
     stored in two 256-bit words; scalar_mult_int expects the full share
     [w1,w0] (the original code loaded only one word per share, which put
     the second share into w1 — the slot scalar_mult_int reads as the upper
     64 bits of the first share — and left w2/w3 stale).
     w0,w1 = dmem[k0] */
  la        x16, k0
  li        x2, 0
  bn.lid    x2, 0(x16++)
  li        x2, 1
  bn.lid    x2, 0(x16)

  /* Load second share of scalar k from dmem.
     w2,w3 = dmem[k1] */
  la        x16, k1
  li        x2, 2
  bn.lid    x2, 0(x16++)
  li        x2, 3
  bn.lid    x2, 0(x16)

  /* call internal scalar multiplication routine
     R = (x_a, y_a) = (w11, w12) <= k*P with k = (k0 + k1) mod n */
  la        x21, x
  la        x22, y
  jal       x1, scalar_mult_int

  /* store result (affine coordinates) in dmem
     dmem[x] <= x_a = w11
     dmem[y] <= y_a = w12 */
  li        x2, 11
  bn.sid    x2++, 0(x21)
  bn.sid    x2, 0(x22)

  ret
| |
| /** |
| * Generate a nonzero random value in the scalar field. |
| * |
| * Returns t, a random value that is nonzero mod n, in shares. |
| * |
| * This follows a modified version of the method in FIPS 186-4 sections B.4.1 |
| * and B.5.1 for generation of secret scalar values d and k. The computation |
| * in FIPS 186-4 is: |
| * seed = RBG(seedlen) // seedlen >= 320 |
| * return (seed mod (n-1)) + 1 |
| * |
| * The important features here are that (a) the seed is at least 64 bits longer |
| * than n in order to minimize bias after the reduction and (b) the resulting |
| * scalar is guaranteed to be nonzero. |
| * |
| * We deviate from FIPS a little bit here because for side-channel protection, |
| * we do not want to fully reduce the seed modulo (n-1) or combine the shares. |
| * Instead, we do the following: |
| * seed0 = RBG(320) |
| * seed1 = RBG(320) |
| * x = URND(127) + 1 // random value for masking |
| * if (seed0 * x + seed1 * x) mod n == 0: |
| * retry |
| * return seed0, seed1 |
| * |
| * Essentially, we get two independent seeds and interpret these as additive |
| * shares of the scalar t = (seed0 + seed1) mod n. Then, we need to ensure t is |
| * nonzero. Multiplying each share with a random masking parameter allows us to |
| * safely add them, and then check if this result is 0; if it is, then t must |
| * be 0 mod n and we need to retry. |
| * |
| * Flags: Flags have no meaning beyond the scope of this subroutine. |
| * |
| * @param[in] w31: all-zero |
| * @param[out] w15,w16: first share of secret scalar t (320 bits) |
| * @param[out] w17,w18: second share of secret scalar t (320 bits) |
| * |
| * clobbered registers: x2, x3, x20, w12 to w29 |
| * clobbered flag groups: FG0 |
| */ |
p256_random_scalar:
  /* Load the curve order n.
     w29 <= dmem[p256_n] = n */
  li        x2, 29
  la        x3, p256_n
  bn.lid    x2, 0(x3)

  /* Copy n into the MOD register. */
  bn.wsrw   0, w29

  /* Load Barrett constant for n.
     w28 <= u_n = dmem[p256_u_n] */
  li        x2, 28
  la        x3, p256_u_n
  bn.lid    x2, 0(x3)

random_scalar_retry:
  /* Obtain 768 bits of randomness from RND. */
  bn.wsrr   w15, 0x1 /* RND */
  bn.wsrr   w16, 0x1 /* RND */
  bn.wsrr   w17, 0x1 /* RND */

  /* XOR each of the three RND words with bits from URND, just in case
     there's any vulnerability in EDN that lets the attacker recover bits
     before they reach OTBN. (Fix: the XORs previously targeted w16..w18,
     which left w15 as raw RND and wrote to w18, whose value is discarded
     by the shift below.) */
  bn.wsrr   w20, 0x2 /* URND */
  bn.xor    w15, w15, w20
  bn.wsrr   w20, 0x2 /* URND */
  bn.xor    w16, w16, w20
  bn.wsrr   w20, 0x2 /* URND */
  bn.xor    w17, w17, w20

  /* Shift bits to get 320-bit seeds: seed0 = [w16,w15], seed1 = [w18,w17].
     w18 <= w16[255:192]
     w16 <= w16[63:0]
     (Fix: the final bn.rshi previously had its source operands swapped,
     which produced w16 = 0 instead of w16[63:0], silently dropping 64 bits
     of the first seed.) */
  bn.rshi   w18, w31, w16 >> 192
  bn.rshi   w20, w16, w31 >> 64
  bn.rshi   w16, w31, w20 >> 192

  /* Generate a random masking parameter.
     w14 <= URND()[255:129] + 1 = x
     The shift keeps x to at most 128 bits as required by mod_mul_320x128
     (same pattern as the alpha generation in p256_sign); adding 1 makes x
     nonzero. (Fix: the shift was previously missing, passing a full
     256-bit value as the 128-bit operand.) */
  bn.wsrr   w14, 0x2 /* URND */
  bn.rshi   w14, w31, w14 >> 129
  bn.addi   w14, w14, 1

  /* w12 <= ([w15,w16] * w14) mod n = (seed0 * x) mod n */
  bn.mov    w24, w15
  bn.mov    w25, w16
  bn.mov    w26, w14
  jal       x1, mod_mul_320x128
  bn.mov    w12, w19

  /* w13 <= ([w17,w18] * w14) mod n = (seed1 * x) mod n */
  bn.mov    w24, w17
  bn.mov    w25, w18
  bn.mov    w26, w14
  jal       x1, mod_mul_320x128
  bn.mov    w13, w19

  /* w12 <= (w12 + w13) mod n = ((seed0 + seed1) * x) mod n */
  bn.addm   w12, w12, w13

  /* Compare to 0.
     FG0.Z <= (w12 =? w31) = (((seed0 + seed1) * x) mod n =? 0) */
  bn.cmp    w12, w31

  /* Read the FG0.Z flag (position 3).
     x2 <= 8 if FG0.Z else 0 */
  csrrw     x2, 0x7c0, x0
  andi      x2, x2, 8

  /* Retry if x2 != 0; since x is nonzero mod n, a zero product means
     (seed0 + seed1) mod n == 0. */
  bne       x2, x0, random_scalar_retry

  /* If we get here, then (seed0 + seed1) mod n is nonzero mod n; return. */

  ret
| |
| /** |
| * Generate the secret key d from a random seed. |
| * |
| * Flags: Flags have no meaning beyond the scope of this subroutine. |
| * |
| * @param[out] dmem[d0]: first share of private key d |
| * @param[out] dmem[d1]: second share of private key d |
| * |
| * clobbered registers: x2, x3, x20, w20, w21, w29 |
| * clobbered flag groups: FG0 |
| */ |
p256_generate_random_key:
  /* Init all-zero register. */
  bn.xor    w31, w31, w31

  /* Generate a random scalar in two 320-bit shares.
     w15, w16 <= d0
     w17, w18 <= d1 */
  jal       x1, p256_random_scalar

  /* Write first share to DMEM.
     dmem[d0] <= w15, w16 = d0 */
  la        x20, d0
  li        x2, 15
  bn.sid    x2, 0(x20++)
  li        x2, 16
  bn.sid    x2, 0(x20)

  /* Write second share to DMEM.
     dmem[d1] <= w17, w18 = d1 */
  la        x20, d1
  li        x2, 17
  bn.sid    x2, 0(x20++)
  li        x2, 18
  bn.sid    x2, 0(x20)

  ret
| |
/**
 * Generate the secret scalar k from a random seed.
 *
 * Generates k as two 320-bit arithmetic shares k0 and k1 (produced by
 * p256_random_scalar) and writes each share to its 64-byte DMEM buffer.
 *
 * Flags: Flags have no meaning beyond the scope of this subroutine.
 *
 * @param[out]  dmem[k0]: first share of secret scalar k
 * @param[out]  dmem[k1]: second share of secret scalar k
 *
 * clobbered registers: x2, x3, x20, w20, w21, w29
 *                      NOTE(review): the called p256_random_scalar also
 *                      writes at least w15-w18 (the returned shares) —
 *                      confirm its full clobber set and update this list.
 * clobbered flag groups: FG0
 */
p256_generate_k:
  /* Init all-zero register. */
  bn.xor    w31, w31, w31

  /* Generate a random scalar in two 320-bit shares.
       w15, w16 <= k0
       w17, w18 <= k1 */
  jal       x1, p256_random_scalar

  /* Write first share to DMEM.
       dmem[k0] <= w15, w16 = k0 */
  la        x20, k0
  li        x2, 15
  bn.sid    x2, 0(x20++)
  li        x2, 16
  bn.sid    x2, 0(x20)

  /* Write second share to DMEM.
       dmem[k1] <= w17, w18 = k1 */
  la        x20, k1
  li        x2, 17
  bn.sid    x2, 0(x20++)
  li        x2, 18
  bn.sid    x2, 0(x20)

  ret
| |
/**
 * Convert boolean shares to arithmetic ones using Goubin's algorithm.
 *
 * Returns x0, x1 such that (s0 ^ s1) = (x0 + x1) mod 2^321.
 *
 * The input consists of two 320-bit shares, s0 and s1. Bits at position 320
 * and above in the input shares will be ignored. We compute the result mod
 * 2^321 so that the high bit of x0 will reveal the carry modulo 2^320.
 *
 * We then use Goubin's boolean-to-arithmetic masking algorithm to switch from
 * this boolean masking scheme to an arithmetic one without ever unmasking the
 * seed. See Algorithm 1 here:
 * https://link.springer.com/content/pdf/10.1007/3-540-44709-1_2.pdf
 *
 * The algorithm is reproduced here for reference:
 *   Input:
 *     s0, s1: k-bit shares such that x = s0 ^ s1
 *     gamma: random k-bit number
 *   Output: x0, k-bit number such that x = (x0 + s1) mod 2^k
 *   Pseudocode:
 *     T := ((s0 ^ gamma) - gamma) mod 2^k
 *     T2 := T ^ s0
 *     G := gamma ^ s1
 *     A := ((s0 ^ G) - G) mod 2^k
 *     return x0 := (A ^ T2)
 *
 * The output x1 is always (s1 mod 2^320).
 *
 * This routine runs in constant time.
 *
 * Flags: Flags have no meaning beyond the scope of this subroutine.
 *
 * @param[in]  [w21, w20]: s0, first share of seed (320 bits)
 * @param[in]  [w23, w22]: s1, second share of seed (320 bits)
 * @param[in]  w31: all-zero
 * @param[out] [w21, w20]: result x0 (321 bits)
 * @param[out] [w23, w22]: result x1 (320 bits)
 *
 * clobbered registers: w1 to w4, w20 to w23
 * clobbered flag groups: FG0
 */
boolean_to_arithmetic:
  /* Mask out excess bits from seed shares. Each `bn.rshi` pair shifts the
     high word left by 192 and back right by 192 (concatenating with w31=0),
     clearing bits 64 and above of the high word so each share is exactly
     320 bits.
       [w21, w20] <= s0 mod 2^320
       [w23, w22] <= s1 mod 2^320 = x1 */
  bn.rshi   w21, w21, w31 >> 64
  bn.rshi   w21, w31, w21 >> 192
  bn.rshi   w23, w23, w31 >> 64
  bn.rshi   w23, w31, w23 >> 192

  /* Fetch 321 bits of randomness from URND (WSR 0x2): 256 bits in w1 plus
     the top 65 bits of a second draw in w2.
       [w2, w1] <= gamma */
  bn.wsrr   w1, 2 /* URND */
  bn.wsrr   w2, 2 /* URND */
  bn.rshi   w2, w31, w2 >> 191

  /* [w4, w3] <= [w21, w20] ^ [w2, w1] = s0 ^ gamma */
  bn.xor    w3, w20, w1
  bn.xor    w4, w21, w2

  /* Subtract gamma. This may result in bits above 2^321, but these will be
     stripped off in the next step.
       [w4, w3] <= [w4, w3] - [w2, w1] = ((s0 ^ gamma) - gamma) mod 2^512 */
  bn.sub    w3, w3, w1
  bn.subb   w4, w4, w2

  /* Truncate subtraction result to 321 bits (keep only the low 65 bits of
     the high word, same shift-up/shift-down idiom as above).
       [w4, w3] <= [w4, w3] mod 2^321 = T */
  bn.rshi   w4, w4, w31 >> 65
  bn.rshi   w4, w31, w4 >> 191

  /* [w4, w3] <= [w4, w3] ^ [w21, w20] = T2 */
  bn.xor    w3, w3, w20
  bn.xor    w4, w4, w21

  /* [w2, w1] <= [w2, w1] ^ [w23, w22] = gamma ^ s1 = G */
  bn.xor    w1, w1, w22
  bn.xor    w2, w2, w23

  /* [w21, w20] <= [w21, w20] ^ [w2, w1] = s0 ^ G */
  bn.xor    w20, w20, w1
  bn.xor    w21, w21, w2

  /* [w21, w20] <= [w21, w20] - [w2, w1] = ((s0 ^ G) - G) mod 2^512 */
  bn.sub    w20, w20, w1
  bn.subb   w21, w21, w2

  /* [w21, w20] <= [w21, w20] mod 2^321 = A */
  bn.rshi   w21, w21, w31 >> 65
  bn.rshi   w21, w31, w21 >> 191

  /* [w21, w20] <= [w21, w20] ^ [w4, w3] = A ^ T2 = x0 */
  bn.xor    w20, w20, w3
  bn.xor    w21, w21, w4

  ret
| |
/**
 * P-256 ECDSA secret key generation.
 *
 * Returns the secret key d in two 320-bit shares d0 and d1, such that:
 *    d = (d0 + d1) mod n
 * ...where n is the curve order.
 *
 * This implementation follows FIPS 186-4 section B.4.1, where we
 * generate d using N+64 random bits (320 bits in this case) as a seed. But
 * while FIPS computes d = (seed mod (n-1)) + 1 to ensure a nonzero key, we
 * instead just compute d = seed mod n. The caller MUST ensure that if this
 * routine is used, then other routines that use d (e.g. signing, public key
 * generation) are checking if d is 0.
 *
 * Most complexity in this routine comes from masking. The input seed is
 * provided in two 320-bit shares, seed0 and seed1, such that:
 *   seed = seed0 ^ seed1
 * Bits at position 320 and above in the input shares will be ignored.
 *
 * We then use Goubin's boolean-to-arithmetic masking algorithm to switch from
 * this boolean masking scheme to an arithmetic one without ever unmasking the
 * seed. See Algorithm 1 here:
 * https://link.springer.com/content/pdf/10.1007/3-540-44709-1_2.pdf
 *
 * For a Coq proof of the correctness of the basic computational logic here
 * see:
 * https://gist.github.com/jadephilipoom/24f44c59cbe59327e2f753867564fa28#file-masked_reduce-v-L226
 *
 * The proof does not cover leakage properties; it mostly just shows that this
 * logic correctly computes (seed mod n) and the carry-handling works.
 *
 * This routine runs in constant time.
 *
 * Flags: Flags have no meaning beyond the scope of this subroutine.
 *
 * @param[in]  [w21, w20]: seed0, first share of seed (320 bits)
 * @param[in]  [w23, w22]: seed1, second share of seed (320 bits)
 * @param[in]  w31: all-zero
 * @param[out] [w21, w20]: d0, first share of private key d (320 bits)
 * @param[out] [w23, w22]: d1, second share of private key d (320 bits)
 *
 * clobbered registers: x2, x3, w1 to w4, w20 to w29
 * clobbered flag groups: FG0
 */
p256_key_from_seed:
  /* Convert from a boolean to an arithmetic mask using Goubin's algorithm.
       [w21, w20] <= ((seed0 ^ seed1) - seed1) mod 2^321 = x0 */
  jal       x1, boolean_to_arithmetic

  /* At this point, we have arithmetic shares modulo 2^321:
       [w21, w20] : x0
       [w23, w22] : x1

     We know that x1=seed1, and seed and x1 are at most 320 bits. Therefore,
     the highest bit of x0 holds a carry bit modulo 2^320:
       x0 = (seed - x1) mod 2^321
       x0 = (seed - x1) mod 2^320 + (if (x1 <= seed) then 0 else 2^320)

     The carry bit then allows us to replace (mod 2^321) with a conditional
     statement:
       seed = (x0 mod 2^320) + x1 - (x0[320] << 320)

     Note that the carry bit is not very sensitive from a side channel
     perspective; x1 <= seed has some bias related to the highest bit of the
     seed, but since the seed is 64 bits larger than n, this single-bit noisy
     leakage should not be significant.

     From here, we want to convert to shares modulo (n * 2^64) -- these shares
     will be equivalent to the seed modulo n but still retain 64 bits of extra
     masking. We compute the new shares as follows:
       c = (x0[320] << 320) mod (n << 64)
       d0 = ((x0 mod 2^320) - c) mod (n << 64))
       d1 = x1 mod (n << 64)

       d = seed mod n = (d0 + d1) mod n
  */

  /* Load curve order n from DMEM.
       w29 <= dmem[p256_n] = n */
  li        x2, 29
  la        x3, p256_n
  bn.lid    x2, 0(x3)

  /* Compute (n << 64).
       [w29,w28] <= w29 << 64 = n << 64 */
  bn.rshi   w28, w29, w31 >> 192
  bn.rshi   w29, w31, w29 >> 192

  /* [w25,w24] <= (x1 - (n << 64)) mod 2^512 */
  bn.sub    w24, w22, w28
  bn.subb   w25, w23, w29

  /* Compute d1. Because 2^320 < 2 * (n << 64), a conditional subtraction is
     sufficient to reduce. Similarly to the carry bit, the conditional bit here
     is not very sensitive because the shares are large relative to n.
     FG0.C here is the borrow from the subtraction above: if it is set,
     x1 < (n << 64) and we keep x1 unchanged.
       [w23,w22] <= x1 mod (n << 64) = d1 */
  bn.sel    w22, w22, w24, FG0.C
  bn.sel    w23, w23, w25, FG0.C

  /* Isolate the carry bit and shift it back into position.
       w25 <= x0[320] << 64 */
  bn.rshi   w25, w31, w21 >> 64
  bn.rshi   w25, w25, w31 >> 192

  /* Clear the carry bit from the original result.
       [w21,w20] <= x0 mod 2^320 */
  bn.xor    w21, w21, w25

  /* Conditionally subtract (n << 64) to reduce.
       [w21,w20] <= (x0 mod 2^320) mod (n << 64) */
  bn.sub    w26, w20, w28
  bn.subb   w27, w21, w29
  bn.sel    w20, w20, w26, FG0.C
  bn.sel    w21, w21, w27, FG0.C

  /* Compute the correction factor. Note that w25 still holds
     (x0[320] << 64), i.e. [w25, 0] = x0[320] << 320; if the carry bit is
     set, the subtraction does not borrow and we select the reduced value.
       [w25,w24] <= (x0[320] << 320) mod (n << 64) = c */
  bn.sub    w26, w31, w28
  bn.subb   w27, w25, w29
  bn.sel    w24, w31, w26, FG0.C
  bn.sel    w25, w25, w27, FG0.C

  /* Compute d0 with a modular subtraction. First we add (n << 64) to protect
     against underflow, then conditionally subtract it again if needed.
       [w21,w20] <= ([w21, w20] - [w25,w24]) mod (n << 64) = d0 */
  bn.add    w20, w20, w28
  bn.addc   w21, w21, w29
  bn.sub    w20, w20, w24
  bn.subb   w21, w21, w25
  bn.sub    w26, w20, w28
  bn.subb   w27, w21, w29
  bn.sel    w20, w20, w26, FG0.C
  bn.sel    w21, w21, w27, FG0.C

  ret
| |
.section .data

/* P-256 domain parameter b (curve equation y^2 = x^3 - 3x + b).
   Eight 32-bit words, least-significant word first:
   b = 0x5ac635d8aa3a93e7b3ebbd55769886bc651d06b0cc53b0f63bce3c3e27d2604b */
.globl p256_b
.balign 32
p256_b:
  .word 0x27d2604b
  .word 0x3bce3c3e
  .word 0xcc53b0f6
  .word 0x651d06b0
  .word 0x769886bc
  .word 0xb3ebbd55
  .word 0xaa3a93e7
  .word 0x5ac635d8

/* P-256 domain parameter p (modulus), least-significant word first:
   p = 0xffffffff00000001000000000000000000000000ffffffffffffffffffffffff */
.globl p256_p
.balign 32
p256_p:
  .word 0xffffffff
  .word 0xffffffff
  .word 0xffffffff
  .word 0x00000000
  .word 0x00000000
  .word 0x00000000
  .word 0x00000001
  .word 0xffffffff

/* Barrett constant u for modulus p, least-significant word first:
   u = 0x00000000fffffffefffffffefffffffeffffffff0000000000000003.
   NOTE(review): presumably the low 256 bits of floor(2^512 / p), with the
   implicit 2^256 term handled by the reduction routine — confirm against
   the Barrett reduction code earlier in this file. */
.globl p256_u_p
.balign 32
p256_u_p:
  .word 0x00000003
  .word 0x00000000
  .word 0xffffffff
  .word 0xfffffffe
  .word 0xfffffffe
  .word 0xfffffffe
  .word 0xffffffff
  .word 0x00000000

/* P-256 domain parameter n (order of base point G), LSW first:
   n = 0xffffffff00000000ffffffffffffffffbce6faada7179e84f3b9cac2fc632551 */
.globl p256_n
.balign 32
p256_n:
  .word 0xfc632551
  .word 0xf3b9cac2
  .word 0xa7179e84
  .word 0xbce6faad
  .word 0xffffffff
  .word 0xffffffff
  .word 0x00000000
  .word 0xffffffff

/* Barrett constant u for n, least-significant word first.
   NOTE(review): presumably the low 256 bits of floor(2^512 / n) — confirm
   against the Barrett reduction code earlier in this file. */
.globl p256_u_n
.balign 32
p256_u_n:
  .word 0xeedf9bfe
  .word 0x012ffd85
  .word 0xdf1a6c21
  .word 0x43190552
  .word 0xffffffff
  .word 0xfffffffe
  .word 0xffffffff
  .word 0x00000000

/* P-256 basepoint G affine x-coordinate, least-significant word first:
   gx = 0x6b17d1f2e12c4247f8bce6e563a440f277037d812deb33a0f4a13945d898c296 */
.globl p256_gx
.balign 32
p256_gx:
  .word 0xd898c296
  .word 0xf4a13945
  .word 0x2deb33a0
  .word 0x77037d81
  .word 0x63a440f2
  .word 0xf8bce6e5
  .word 0xe12c4247
  .word 0x6b17d1f2

/* P-256 basepoint G affine y-coordinate, least-significant word first:
   gy = 0x4fe342e2fe1a7f9b8ee7eb4a7c0f9e162bce33576b315ececbb6406837bf51f5 */
.globl p256_gy
.balign 32
p256_gy:
  .word 0x37bf51f5
  .word 0xcbb64068
  .word 0x6b315ece
  .word 0x2bce3357
  .word 0x7c0f9e16
  .word 0x8ee7eb4a
  .word 0xfe1a7f9b
  .word 0x4fe342e2
| |
.section .bss

/* Random scalar k in two 320-bit shares; each share gets a 64-byte
   (two-WDR) buffer, written by p256_generate_k. */
.balign 32
.weak k0
k0:
  .zero 64
.balign 32
.weak k1
k1:
  .zero 64

/* message digest (256 bits) */
.balign 32
.weak msg
msg:
  .zero 32

/* signature R (256 bits) */
.balign 32
.weak r
r:
  .zero 32

/* signature S (256 bits) */
.balign 32
.weak s
s:
  .zero 32

/* public key x-coordinate (256 bits) */
.balign 32
.weak x
x:
  .zero 32

/* public key y-coordinate (256 bits) */
.balign 32
.weak y
y:
  .zero 32
| |
/* Private key d in two 320-bit shares; each share gets a 64-byte
   (two-WDR) buffer, written by p256_generate_random_key.
   BUGFIX: the labels must be `d0`/`d1` to define the weak symbols declared
   just above them; previously they were named `d_share0`/`d_share1`, so
   `la x20, d0` and `la x20, d1` in p256_generate_random_key referenced
   undefined weak symbols instead of this storage. This now matches the
   k0/k1 pattern above. */
.balign 32
.weak d0
d0:
  .zero 64
.balign 32
.weak d1
d1:
  .zero 64
| |
/* Verification result x_r (aka x_1), 256 bits, produced by p256_verify. */
.balign 32
.weak x_r
x_r:
  .zero 32