/* blob: acfdccf2429704741daa251c1af1e4dfae2ec1f7 [file] [log] [blame] */
/* Copyright lowRISC contributors. */
/* Licensed under the Apache License, Version 2.0, see LICENSE for details. */
/* SPDX-License-Identifier: Apache-2.0 */
/*
* P-384 specific routines
*/
.section .text
/**
* 384-bit modular multiplication based on Barrett reduction algorithm
* optimized for the special modulus of the NIST P-384 curve.
*
* Returns c = a x b % p.
*
* Expects: two operands, modulus p and pre-calculated parameter u for barrett
* reduction (usually greek mu in literature). u is expected without the
* leading 1 at bit 384. u has to be pre-calculated as u = floor(2^768/p).
*
* This implementation mostly follows the description in the
* "Handbook of Applied Cryptography" in Algorithm 14.42.
* Differences:
* - This implementation incorporates a multiplication before the reduction.
* Therefore it expects two operands (a, b) instead of a wider integer x.
* - The computation of q2 ignores the MSbs of q1 and u to allow using
* a 384x384 bit multiplication. This is compensated later by
* individual (conditional) additions.
* - The truncations in step 2 of HAC 14.42 in the form of (... mod b^(k+1) )
* are not implemented here and the full register width is used. This
* allows to omit computation of r1 (since r1=x) and step 3 of HAC 14.42
*
* Flags: Flags when leaving this subroutine depend on a potentially discarded
* value and therefore are not usable after return.
*
* @param[in] [w11, w10]: a, first operand, max. length 384 bit, a < p.
* @param[in] [w17, w16]: b, second operand, max. length 384 bit, b < p.
* @param[in] [w13, w12]: p, modulus of P384 i.e.:
* p = 2^384 - 2^128 - 2^96 + 2^32 - 1.
* @param[in] [w15, w14]: u, pre-computed Barrett constant (without u[384]/MSb
* of u which is always 1 for the allowed range but
* has to be set to 0 here).
* @param[in] w31: all-zero.
* @param[out] [w17, w16]: c, result, max. length 384 bit.
*
* Clobbered registers: w10, w11, w16, w17, w18, w19, w20, w21, w22, w23, w24
* Clobbered flag groups: FG0
*/
.globl barrett384_p384
barrett384_p384:
/* Compute the integer product of the operands x = a * b
x = [w18, w22, w21] = a * b = [w11, w10] * [w17, w16]
=> max. length x: 768 bit
The product is accumulated from 64x64 bit partial products; the .so
variants target the result halves w21.L, w21.U, w22.L, w22.U, w18.L and
w18.U in that order. */
bn.mulqacc.z w10.0, w16.0, 0
bn.mulqacc w10.0, w16.1, 64
bn.mulqacc.so w21.L, w10.1, w16.0, 64
bn.mulqacc w10.0, w16.2, 0
bn.mulqacc w10.1, w16.1, 0
bn.mulqacc w10.2, w16.0, 0
bn.mulqacc w10.0, w16.3, 64
bn.mulqacc w10.1, w16.2, 64
bn.mulqacc w10.2, w16.1, 64
bn.mulqacc.so w21.U, w10.3, w16.0, 64
bn.mulqacc w10.0, w17.0, 0
bn.mulqacc w10.1, w16.3, 0
bn.mulqacc w10.2, w16.2, 0
bn.mulqacc w10.3, w16.1, 0
bn.mulqacc w11.0, w16.0, 0
bn.mulqacc w10.0, w17.1, 64
bn.mulqacc w10.1, w17.0, 64
bn.mulqacc w10.2, w16.3, 64
bn.mulqacc w10.3, w16.2, 64
bn.mulqacc w11.0, w16.1, 64
bn.mulqacc.so w22.L, w11.1, w16.0, 64
bn.mulqacc w10.1, w17.1, 0
bn.mulqacc w10.2, w17.0, 0
bn.mulqacc w10.3, w16.3, 0
bn.mulqacc w11.0, w16.2, 0
bn.mulqacc w11.1, w16.1, 0
bn.mulqacc w10.2, w17.1, 64
bn.mulqacc w10.3, w17.0, 64
bn.mulqacc w11.0, w16.3, 64
bn.mulqacc.so w22.U, w11.1, w16.2, 64
bn.mulqacc w10.3, w17.1, 0
bn.mulqacc w11.0, w17.0, 0
bn.mulqacc w11.1, w16.3, 0
bn.mulqacc w11.0, w17.1, 64
bn.mulqacc.so w18.L, w11.1, w17.0, 64
bn.mulqacc.so w18.U, w11.1, w17.1, 0
/* Store correction factor to compensate for later neglected MSb of x.
x is 768 bit wide and therefore the 383 bit right shifted version q1
(below) contains 385 bit. Bit 384 of q1 is neglected to allow using a
384x384 multiplier. If the MSb of x is set, we temporarily store u
(otherwise zero) in [w24, w23] for a later constant-time correction of
the multiplication with u. Note that this requires the M flag to be
carried over unchanged from the last instruction of the multiplication
above, so no flag-modifying instruction may be inserted in between. */
bn.sel w23, w14, w31, M
bn.sel w24, w15, w31, M
/* Compute q1 = x >> 383
q1 = [w11, w10] = [w18, w22, w21] >> 383 = [w18, w22] >> 127
=> max length q1: 385 bits (bit 384 is dropped here, see above) */
bn.rshi w11, w31, w18 >> 127
bn.rshi w10, w18, w22 >> 127
/* Compute q2 = q1*u
Instead of full q2 (which would be up to 770 bits) we ignore the MSb of u
and the MSb of q1 and correct this later. This allows using a 384x384
multiplier. We use the property that u for the modulus of P384 is zero in
the bits 383 downto 129 and use a 384x192 multiplication routine.
=> max. length q2': 513 bit
q2' = q1[383:0]*u[128:0] = [w18, w17, w16] = [w11, w10] * [w15, w14] */
/* 384x192 bit multiplication kernel (576 bit result width); only the three
low 64-bit limbs of w14 are used as second operand */
bn.mulqacc.z w10.0, w14.0, 0
bn.mulqacc w10.0, w14.1, 64
bn.mulqacc.so w16.L, w10.1, w14.0, 64
bn.mulqacc w10.0, w14.2, 0
bn.mulqacc w10.1, w14.1, 0
bn.mulqacc w10.2, w14.0, 0
bn.mulqacc w10.1, w14.2, 64
bn.mulqacc w10.2, w14.1, 64
bn.mulqacc.so w16.U, w10.3, w14.0, 64
bn.mulqacc w10.2, w14.2, 0
bn.mulqacc w10.3, w14.1, 0
bn.mulqacc w11.0, w14.0, 0
bn.mulqacc w10.3, w14.2, 64
bn.mulqacc w11.0, w14.1, 64
bn.mulqacc.so w17.L, w11.1, w14.0, 64
bn.mulqacc w11.0, w14.2, 0
bn.mulqacc w11.1, w14.1, 0
bn.mulqacc.so w17.U, w11.1, w14.2, 64
/* w14.3 is always zero here due to structure of Barrett constant */
bn.mulqacc.wo w18, w11.1, w14.3, 64
/* q3 = q2 >> 385
In this step, the compensation for the neglected MSbs of q1 and u is
carried out underway. To add them in the q2 domain, they would have to be
left shifted by 384 bit first. To directly add them we first shift q2' by
384 bit to the right, perform the additions, and shift the result another
bit to the right. The additions cannot overflow due to leading zeros
after shift.
q2'' = q2' >> 384 = [w20, w19] = [w18, w17, w16] >> 384
= [w18, w17] >> 128 */
bn.rshi w20, w31, w18 >> 128
bn.rshi w19, w18, w17 >> 128
/* Add q1. This is unconditional since MSb of u is always 1.
This cannot overflow due to leading zeros.
q2''' = q2'' + q1 = [w20, w19] = [w20, w19] + [w11, w10] */
bn.add w19, w19, w10
bn.addc w20, w20, w11
/* Conditionally add u (without leading 1) in case of MSb of x being set.
[w24, w23] holds either u or zero (selected above), so the addition itself
is unconditional. This is the "real" q2 but shifted by 384 bits to the
right. This cannot overflow due to leading zeros
q2'''' = x[767]?q2'''+u[383:0]:q2'''
= [w20, w19] + [w24, w23] = q2 >> 384 */
bn.add w19, w19, w23
bn.addc w20, w20, w24
/* finally this gives q3 by shifting the remaining bit to the right
q3 = q2 >> 385 = q2'''' >> 1 = [w11, w10] = [w20, w19] >> 1 */
bn.rshi w11, w31, w20 >> 1
bn.rshi w10, w20, w19 >> 1
/* r2 = (q3*p)[511:0] = [w17, w16] = ([w11, w10] * [w13, w12])[511:0]
A 384x384 bit multiplication kernel is used here, hence neither q3 nor p
may be wider than 384 bit. This is always the case for p. For q3 it
is the case if a<p and b<p.
The 256 highest bits of the multiplication result are not needed,
so we do not compute them. */
bn.mulqacc.z w10.0, w12.0, 0
bn.mulqacc w10.0, w12.1, 64
bn.mulqacc.so w16.L, w10.1, w12.0, 64
bn.mulqacc w10.0, w12.2, 0
bn.mulqacc w10.1, w12.1, 0
bn.mulqacc w10.2, w12.0, 0
bn.mulqacc w10.0, w12.3, 64
bn.mulqacc w10.1, w12.2, 64
bn.mulqacc w10.2, w12.1, 64
bn.mulqacc.so w16.U, w10.3, w12.0, 64
bn.mulqacc w10.0, w13.0, 0
bn.mulqacc w10.1, w12.3, 0
bn.mulqacc w10.2, w12.2, 0
bn.mulqacc w10.3, w12.1, 0
bn.mulqacc w11.0, w12.0, 0
bn.mulqacc w10.0, w13.1, 64
bn.mulqacc w10.1, w13.0, 64
bn.mulqacc w10.2, w12.3, 64
bn.mulqacc w10.3, w12.2, 64
bn.mulqacc w11.0, w12.1, 64
bn.mulqacc.so w17.L, w11.1, w12.0, 64
bn.mulqacc w10.1, w13.1, 0
bn.mulqacc w10.2, w13.0, 0
bn.mulqacc w10.3, w12.3, 0
bn.mulqacc w11.0, w12.2, 0
bn.mulqacc w11.1, w12.1, 0
bn.mulqacc w10.2, w13.1, 64
bn.mulqacc w10.3, w13.0, 64
bn.mulqacc w11.0, w12.3, 64
bn.mulqacc.so w17.U, w11.1, w12.2, 64
/* Compute r = x-r2 = x-q3*p
since 0 <= r < 3*p, we only need to consider the lower limbs of x and r2
r[511:0] = [w22, w21] = [w22, w21] - [w17, w16] */
bn.sub w21, w21, w16
bn.subb w22, w22, w17
/* Barrett algorithm requires subtraction of the modulus at most two times if
result is too large. However in the special case of P-384 we need to
subtract only once.
Trial subtraction [w17, w16] = r - p; if it borrows (C set) r was already
smaller than p and is kept, otherwise r - p is taken. */
bn.sub w16, w21, w12
bn.subb w17, w22, w13
bn.sel w16, w21, w16, C
bn.sel w17, w22, w17, C
/* return result: c =[w17, w16] = a * b % p. */
ret
/**
* P-384 point addition in projective space
*
* returns R = (x_r, y_r, z_r) <= P+Q = (x_p, y_p, z_p) + (x_q, y_q, z_q)
* with R, P and Q being valid P-384 curve points
* in projective coordinates
*
* This routine adds two valid P-384 curve points in projective space.
* Point addition is performed based on the complete formulas of Bosma and
* Lenstra for Weierstrass curves as first published in [1] and
* optimized in [2].
* The implemented version follows Algorithm 4 of [2] which is an optimized
* variant for Weierstrass curves with domain parameter 'a' set to a=-3.
* Numbering of the steps below and naming of symbols follows the
* terminology of Algorithm 4 of [2].
* The routine is limited to P-384 curve points due to:
* - fixed a=-3 domain parameter
* - usage of a P-384 optimized Barrett multiplication kernel
* This routine runs in constant time.
*
* [1] https://doi.org/10.1006/jnth.1995.1088
* [2] https://doi.org/10.1007/978-3-662-49890-3_16
*
* @param[in] x22: set to 10, pointer to in reg for Barrett routine
* @param[in] x23: set to 11, pointer to in reg for Barrett routine
* @param[in] x24: set to 16, pointer to in/out reg for Barrett routine
* @param[in] x25: set to 17, pointer to in/out reg for Barrett routine
* @param[in] x26: dptr_p_p, dmem pointer to point P in dmem (projective)
* @param[in] x27: dptr_q_p, dmem pointer to point Q in dmem (projective)
* @param[in] x28: dptr_b, dmem pointer to domain parameter b of P-384 in dmem
* @param[in] [w13, w12]: p, modulus of underlying field of P-384
* @param[in] [w15, w14]: u[383:0] lower 384 bit of Barrett constant u for
* modulus p
* @param[in] w31: all-zero.
* @param[out] [w26, w25]: x_r, x-coordinate of resulting point R
* @param[out] [w28, w27]: y_r, y-coordinate of resulting point R
* @param[out] [w30, w29]: z_r, z-coordinate of resulting point R
*
* Flags: When leaving this subroutine, flags of FG0 depend on an
* intermediate result and are not usable after return.
* FG1 is not modified in this subroutine.
*
* clobbered registers: w0 to w30
* clobbered flag groups: FG0
*/
.globl proj_add_p384
proj_add_p384:
/* mapping of parameters to symbols of [2] (Algorithm 4):
X1 = x_p; Y1 = y_p; Z1 = z_p; X2 = x_q; Y2 = y_q; Z2 = z_q
X3 = x_r; Y3 = y_r; Z3 = z_r */
/* Note on the loads below: x22..x25 are expected to hold the register
indices 10, 11, 16 and 17 (see parameter list), so each group of four
bn.lid instructions fills [w11, w10] and [w17, w16], which are the operand
registers of barrett384_p384. Each coordinate occupies two 32-byte dmem
cells (low limb first), coordinates are 64 bytes apart. */
/* 1: [w1, w0] = t0 <= X1*X2 = dmem[x26+0]*dmem[x27+0] */
bn.lid x22, 0(x26)
bn.lid x23, 32(x26)
bn.lid x24, 0(x27)
bn.lid x25, 32(x27)
jal x1, barrett384_p384
bn.mov w0, w16
bn.mov w1, w17
/* 2: [w3, w2] = t1 <= Y1*Y2 = dmem[x26+64]*dmem[x27+64] */
bn.lid x22, 64(x26)
bn.lid x23, 96(x26)
bn.lid x24, 64(x27)
bn.lid x25, 96(x27)
jal x1, barrett384_p384
bn.mov w2, w16
bn.mov w3, w17
/* 3: [w5, w4] = t2 <= Z1*Z2 = dmem[x26+128]*dmem[x27+128] */
bn.lid x22, 128(x26)
bn.lid x23, 160(x26)
bn.lid x24, 128(x27)
bn.lid x25, 160(x27)
jal x1, barrett384_p384
bn.mov w4, w16
bn.mov w5, w17
/* 4: [w7, w6] = t3 <= X1+Y1 = dmem[x26+0]+dmem[x26+64] */
bn.lid x22, 0(x26)
bn.lid x23, 32(x26)
bn.lid x24, 64(x26)
bn.lid x25, 96(x26)
/* Modular addition pattern (used for all additions in this routine):
compute the plain sum in [w17, w16], trial-subtract p into [w11, w10] and
keep the unreduced sum only if that subtraction borrowed (C set),
otherwise take sum - p. */
bn.add w16, w10, w16
bn.addc w17, w11, w17
bn.sub w10, w16, w12
bn.subb w11, w17, w13
bn.sel w16, w16, w10, C
bn.sel w17, w17, w11, C
bn.mov w6, w16
bn.mov w7, w17
/* 5: [w9, w8] = t4 <= X2+Y2 = dmem[x27+0]+dmem[x27+64] */
bn.lid x22, 0(x27)
bn.lid x23, 32(x27)
bn.lid x24, 64(x27)
bn.lid x25, 96(x27)
bn.add w16, w10, w16
bn.addc w17, w11, w17
bn.sub w10, w16, w12
bn.subb w11, w17, w13
bn.sel w16, w16, w10, C
bn.sel w17, w17, w11, C
bn.mov w8, w16
bn.mov w9, w17
/* 6: [w7, w6] = t3 <= t3*t4 = [w7, w6]*[w9, w8] */
bn.mov w10, w6
bn.mov w11, w7
bn.mov w16, w8
bn.mov w17, w9
jal x1, barrett384_p384
bn.mov w6, w16
bn.mov w7, w17
/* 7: [w9, w8] = t4 <= t0+t1 = [w1, w0]+[w3, w2] */
bn.add w16, w0, w2
bn.addc w17, w1, w3
bn.sub w10, w16, w12
bn.subb w11, w17, w13
bn.sel w16, w16, w10, C
bn.sel w17, w17, w11, C
bn.mov w8, w16
bn.mov w9, w17
/* 8: [w7, w6] = t3 <= t3-t4 = [w7, w6]-[w9, w8] */
/* Modular subtraction pattern (used for all subtractions in this routine):
compute the plain difference in [w17, w16], then add p into [w11, w10].
The C flag tested by bn.sel is the carry out of that addition; for
operands < p it is set exactly when the difference wrapped below zero, in
which case difference + p is selected, otherwise the plain difference. */
bn.sub w16, w6, w8
bn.subb w17, w7, w9
bn.add w10, w16, w12
bn.addc w11, w17, w13
bn.sel w16, w10, w16, C
bn.sel w17, w11, w17, C
bn.mov w6, w16
bn.mov w7, w17
/* 9: [w9, w8] = t4 <= Y1+Z1 = dmem[x26+64]+dmem[x26+128] */
bn.lid x22, 64(x26)
bn.lid x23, 96(x26)
bn.lid x24, 128(x26)
bn.lid x25, 160(x26)
bn.add w16, w10, w16
bn.addc w17, w11, w17
bn.sub w10, w16, w12
bn.subb w11, w17, w13
bn.sel w16, w16, w10, C
bn.sel w17, w17, w11, C
bn.mov w8, w16
bn.mov w9, w17
/* 10: [w26, w25] = X3 <= Y2+Z2 = dmem[x27+64]+dmem[x27+128] */
bn.lid x22, 64(x27)
bn.lid x23, 96(x27)
bn.lid x24, 128(x27)
bn.lid x25, 160(x27)
bn.add w16, w10, w16
bn.addc w17, w11, w17
bn.sub w10, w16, w12
bn.subb w11, w17, w13
bn.sel w16, w16, w10, C
bn.sel w17, w17, w11, C
bn.mov w25, w16
bn.mov w26, w17
/* 11: [w9, w8] = t4 <= t4*X3 = [w9, w8]*[w26, w25] */
bn.mov w10, w8
bn.mov w11, w9
bn.mov w16, w25
bn.mov w17, w26
jal x1, barrett384_p384
bn.mov w8, w16
bn.mov w9, w17
/* 12: [w26, w25] = X3 <= t1+t2 = [w3, w2]+[w5, w4] */
bn.add w16, w2, w4
bn.addc w17, w3, w5
bn.sub w10, w16, w12
bn.subb w11, w17, w13
bn.sel w16, w16, w10, C
bn.sel w17, w17, w11, C
bn.mov w25, w16
bn.mov w26, w17
/* 13: [w9, w8] = t4 <= t4-X3 = [w9, w8]-[w26, w25] */
bn.sub w16, w8, w25
bn.subb w17, w9, w26
bn.add w10, w16, w12
bn.addc w11, w17, w13
bn.sel w16, w10, w16, C
bn.sel w17, w11, w17, C
bn.mov w8, w16
bn.mov w9, w17
/* 14: [w26, w25] = X3 <= X1+Z1 = dmem[x26+0]+dmem[x26+128] */
bn.lid x22, 0(x26)
bn.lid x23, 32(x26)
bn.lid x24, 128(x26)
bn.lid x25, 160(x26)
bn.add w16, w10, w16
bn.addc w17, w11, w17
bn.sub w10, w16, w12
bn.subb w11, w17, w13
bn.sel w16, w16, w10, C
bn.sel w17, w17, w11, C
bn.mov w25, w16
bn.mov w26, w17
/* 15: [w28, w27] = Y3 <= X2+Z2 = dmem[x27+0]+dmem[x27+128] */
bn.lid x22, 0(x27)
bn.lid x23, 32(x27)
bn.lid x24, 128(x27)
bn.lid x25, 160(x27)
bn.add w16, w10, w16
bn.addc w17, w11, w17
bn.sub w10, w16, w12
bn.subb w11, w17, w13
bn.sel w16, w16, w10, C
bn.sel w17, w17, w11, C
bn.mov w27, w16
bn.mov w28, w17
/* 16: [w26, w25] = X3 <= X3*Y3 = [w26, w25]*[w28, w27] */
bn.mov w10, w25
bn.mov w11, w26
bn.mov w16, w27
bn.mov w17, w28
jal x1, barrett384_p384
bn.mov w25, w16
bn.mov w26, w17
/* 17: [w28, w27] = Y3 <= t0+t2 = [w1, w0]+[w5, w4] */
bn.add w16, w0, w4
bn.addc w17, w1, w5
bn.sub w10, w16, w12
bn.subb w11, w17, w13
bn.sel w16, w16, w10, C
bn.sel w17, w17, w11, C
bn.mov w27, w16
bn.mov w28, w17
/* 18: [w28, w27] = Y3 <= X3-Y3 = [w26, w25]-[w28, w27] */
bn.sub w16, w25, w27
bn.subb w17, w26, w28
bn.add w10, w16, w12
bn.addc w11, w17, w13
bn.sel w16, w10, w16, C
bn.sel w17, w11, w17, C
bn.mov w27, w16
bn.mov w28, w17
/* 19: [w30, w29] = Z3 <= b*t2 = dmem[x28+0]*[w5, w4] */
bn.lid x22, 0(x28)
bn.lid x23, 32(x28)
bn.mov w16, w4
bn.mov w17, w5
jal x1, barrett384_p384
bn.mov w29, w16
bn.mov w30, w17
/* 20: [w26, w25] = X3 <= Y3-Z3 = [w28, w27]-[w30, w29] */
bn.sub w16, w27, w29
bn.subb w17, w28, w30
bn.add w10, w16, w12
bn.addc w11, w17, w13
bn.sel w16, w10, w16, C
bn.sel w17, w11, w17, C
bn.mov w25, w16
bn.mov w26, w17
/* 21: [w30, w29] = Z3 <= X3+X3 = [w26, w25]+[w26, w25] */
bn.add w16, w25, w25
bn.addc w17, w26, w26
bn.sub w10, w16, w12
bn.subb w11, w17, w13
bn.sel w16, w16, w10, C
bn.sel w17, w17, w11, C
bn.mov w29, w16
bn.mov w30, w17
/* 22: [w26, w25] = X3 <= X3+Z3 = [w26, w25]+[w30, w29] */
bn.add w16, w25, w29
bn.addc w17, w26, w30
bn.sub w10, w16, w12
bn.subb w11, w17, w13
bn.sel w16, w16, w10, C
bn.sel w17, w17, w11, C
bn.mov w25, w16
bn.mov w26, w17
/* 23: [w30, w29] = Z3 <= t1-X3 = [w3, w2]-[w26, w25] */
bn.sub w16, w2, w25
bn.subb w17, w3, w26
bn.add w10, w16, w12
bn.addc w11, w17, w13
bn.sel w16, w10, w16, C
bn.sel w17, w11, w17, C
bn.mov w29, w16
bn.mov w30, w17
/* 24: [w26, w25] = X3 <= t1+X3 = [w3, w2]+[w26, w25] */
bn.add w16, w2, w25
bn.addc w17, w3, w26
bn.sub w10, w16, w12
bn.subb w11, w17, w13
bn.sel w16, w16, w10, C
bn.sel w17, w17, w11, C
bn.mov w25, w16
bn.mov w26, w17
/* 25: [w28, w27] = Y3 <= b*Y3 = dmem[x28+0]*[w28, w27] */
bn.lid x22, 0(x28)
bn.lid x23, 32(x28)
bn.mov w16, w27
bn.mov w17, w28
jal x1, barrett384_p384
bn.mov w27, w16
bn.mov w28, w17
/* 26: [w3, w2] = t1 <= t2+t2 = [w5, w4]+[w5, w4] */
bn.add w16, w4, w4
bn.addc w17, w5, w5
bn.sub w10, w16, w12
bn.subb w11, w17, w13
bn.sel w16, w16, w10, C
bn.sel w17, w17, w11, C
bn.mov w2, w16
bn.mov w3, w17
/* 27: [w5, w4] = t2 <= t1+t2 = [w3, w2]+[w5, w4] */
bn.add w16, w2, w4
bn.addc w17, w3, w5
bn.sub w10, w16, w12
bn.subb w11, w17, w13
bn.sel w16, w16, w10, C
bn.sel w17, w17, w11, C
bn.mov w4, w16
bn.mov w5, w17
/* 28: [w28, w27] = Y3 <= Y3-t2 = [w28, w27]-[w5, w4] */
bn.sub w16, w27, w4
bn.subb w17, w28, w5
bn.add w10, w16, w12
bn.addc w11, w17, w13
bn.sel w16, w10, w16, C
bn.sel w17, w11, w17, C
bn.mov w27, w16
bn.mov w28, w17
/* 29: [w28, w27] = Y3 <= Y3-t0 = [w28, w27]-[w1, w0] */
bn.sub w16, w27, w0
bn.subb w17, w28, w1
bn.add w10, w16, w12
bn.addc w11, w17, w13
bn.sel w16, w10, w16, C
bn.sel w17, w11, w17, C
bn.mov w27, w16
bn.mov w28, w17
/* 30: [w3, w2] = t1 <= Y3+Y3 = [w28, w27]+[w28, w27] */
bn.add w16, w27, w27
bn.addc w17, w28, w28
bn.sub w10, w16, w12
bn.subb w11, w17, w13
bn.sel w16, w16, w10, C
bn.sel w17, w17, w11, C
bn.mov w2, w16
bn.mov w3, w17
/* 31: [w28, w27] = Y3 <= t1+Y3 = [w3, w2]+[w28, w27] */
bn.add w16, w2, w27
bn.addc w17, w3, w28
bn.sub w10, w16, w12
bn.subb w11, w17, w13
bn.sel w16, w16, w10, C
bn.sel w17, w17, w11, C
bn.mov w27, w16
bn.mov w28, w17
/* 32: [w3, w2] = t1 <= t0+t0 = [w1, w0]+[w1, w0] */
bn.add w16, w0, w0
bn.addc w17, w1, w1
bn.sub w10, w16, w12
bn.subb w11, w17, w13
bn.sel w16, w16, w10, C
bn.sel w17, w17, w11, C
bn.mov w2, w16
bn.mov w3, w17
/* 33: [w1, w0] = t0 <= t1+t0 = [w3, w2]+[w1, w0] */
bn.add w16, w2, w0
bn.addc w17, w3, w1
bn.sub w10, w16, w12
bn.subb w11, w17, w13
bn.sel w16, w16, w10, C
bn.sel w17, w17, w11, C
bn.mov w0, w16
bn.mov w1, w17
/* 34: [w1, w0] = t0 <= t0-t2 = [w1, w0]-[w5, w4] */
bn.sub w16, w0, w4
bn.subb w17, w1, w5
bn.add w10, w16, w12
bn.addc w11, w17, w13
bn.sel w16, w10, w16, C
bn.sel w17, w11, w17, C
bn.mov w0, w16
bn.mov w1, w17
/* 35: [w3, w2] = t1 <= t4*Y3 = [w9, w8]*[w28, w27] */
bn.mov w10, w8
bn.mov w11, w9
bn.mov w16, w27
bn.mov w17, w28
jal x1, barrett384_p384
bn.mov w2, w16
bn.mov w3, w17
/* 36: [w5, w4] = t2 <= t0*Y3 = [w1, w0]*[w28, w27] */
bn.mov w10, w0
bn.mov w11, w1
bn.mov w16, w27
bn.mov w17, w28
jal x1, barrett384_p384
bn.mov w4, w16
bn.mov w5, w17
/* 37: [w28, w27] = Y3 <= X3*Z3 = [w26, w25]*[w30, w29] */
bn.mov w10, w25
bn.mov w11, w26
bn.mov w16, w29
bn.mov w17, w30
jal x1, barrett384_p384
bn.mov w27, w16
bn.mov w28, w17
/* 38: [w28, w27] = Y3 <= Y3+t2 = [w28, w27]+[w5, w4] */
bn.add w16, w27, w4
bn.addc w17, w28, w5
bn.sub w10, w16, w12
bn.subb w11, w17, w13
bn.sel w16, w16, w10, C
bn.sel w17, w17, w11, C
bn.mov w27, w16
bn.mov w28, w17
/* 39: [w26, w25] = X3 <= t3*X3 = [w7, w6]*[w26, w25] */
bn.mov w10, w6
bn.mov w11, w7
bn.mov w16, w25
bn.mov w17, w26
jal x1, barrett384_p384
bn.mov w25, w16
bn.mov w26, w17
/* 40: [w26, w25] = X3 <= X3-t1 = [w26, w25]-[w3, w2] */
bn.sub w16, w25, w2
bn.subb w17, w26, w3
bn.add w10, w16, w12
bn.addc w11, w17, w13
bn.sel w16, w10, w16, C
bn.sel w17, w11, w17, C
bn.mov w25, w16
bn.mov w26, w17
/* 41: [w30, w29] = Z3 <= t4*Z3 = [w9, w8]*[w30, w29] */
bn.mov w10, w8
bn.mov w11, w9
bn.mov w16, w29
bn.mov w17, w30
jal x1, barrett384_p384
bn.mov w29, w16
bn.mov w30, w17
/* 42: [w3, w2] = t1 <= t3*t0 = [w7, w6]*[w1, w0] */
bn.mov w10, w6
bn.mov w11, w7
bn.mov w16, w0
bn.mov w17, w1
jal x1, barrett384_p384
bn.mov w2, w16
bn.mov w3, w17
/* 43: [w30, w29] = Z3 <= Z3+t1 = [w30, w29]+[w3, w2] */
bn.add w16, w29, w2
bn.addc w17, w30, w3
bn.sub w10, w16, w12
bn.subb w11, w17, w13
bn.sel w16, w16, w10, C
bn.sel w17, w17, w11, C
bn.mov w29, w16
bn.mov w30, w17
/* return R = (X3, Y3, Z3) = ([w26, w25], [w28, w27], [w30, w29]) */
ret
/**
* Convert projective coordinates of a P-384 curve point to affine coordinates
*
* returns P = (x_a, y_a) = (x/z mod p, y/z mod p)
* where P is a valid P-384 curve point,
* x_a and y_a are the resulting affine coordinates of the
* curve point,
* x,y and z are a set of projective coordinates of the
* point and
* p is the modulus of the P-384 underlying finite field.
*
* This routine computes the affine coordinates for a set of projective
* coordinates of a valid P-384 curve point. The routine performs the required
* divisions by computing the multiplicative modular inverse of the
* projective z-coordinate in the underlying finite field of the P-384 curve.
* For inverse computation Fermat's little theorem is used, i.e.
* we compute z^-1 = z^(p-2) mod p.
* For exponentiation a 16 step addition chain is used.
* Source of the addition chain is the addchain project:
* https://github.com/mmcloughlin/addchain/
*
* Flags: Flags when leaving this subroutine depend on a potentially discarded
* value and therefore are not usable after return.
*
* @param[in] [w26,w25]: x, x-coordinate of curve point (projective).
* @param[in] [w28,w27]: y, y-coordinate of curve point (projective).
* @param[in] [w30,w29]: z, z-coordinate of curve point (projective).
* @param[in] [w13, w12]: p, modulus of P-384.
* @param[in] [w15, w14]: u, pre-computed Barrett constant for p,
* lower 384 bits, i.e. (2^(2*384) div p)[383:0].
* @param[in] w31: all-zero.
* @param[out] [w26, w25]: x_a, affine x-coordinate of resulting point.
* @param[out] [w28, w27]: y_a, affine y-coordinate of resulting point.
*
* clobbered registers: w0 to w28
* clobbered flag groups: FG0
*/
proj_to_affine_p384:
/* Throughout the addition chain the running value stays in [w17,w16], which
is both an input and the output of barrett384_p384. Therefore only the
other factor [w11,w10] has to be (re)loaded before each call: a copy of
[w17,w16] for a squaring, or a stored intermediate value for a
multiplication. */
/* Exp: 0b10 = 2*0b1
Val: r10 = z^2 mod p
[w17,w16] <= [w30,w29]^2 mod [w13,w12] */
bn.mov w10, w29
bn.mov w11, w30
bn.mov w16, w29
bn.mov w17, w30
jal x1, barrett384_p384
/* Exp: 0b11 = 0b1+0b10
Val: r11 <= z*r10 mod p
[w17,w16] <= [w30,w29]*[w17,w16] mod [w13,w12] */
bn.mov w10, w29
bn.mov w11, w30
jal x1, barrett384_p384
/* Exp: 0b110 = 2*0b11
Val: r110 = r11^2 mod p
[w17,w16] <= [w17,w16]^2 mod [w13,w12] */
bn.mov w10, w16
bn.mov w11, w17
jal x1, barrett384_p384
/* Exp: 0b111 = 0b1+0b110
Val: r111 <= z*r110 mod p
[w1,w0] = [w17,w16] <= [w30,w29]*[w17,w16] mod [w13,w12] */
bn.mov w10, w29
bn.mov w11, w30
jal x1, barrett384_p384
bn.mov w0, w16
bn.mov w1, w17
/* Exp: 0b111000 = 0b111<<3
Val: r111000 <= r111^(2^3) mod p
[w17,w16] <= [w17,w16]^(2^3) mod [w13,w12]
Each loop iteration performs one squaring. The trailing nop pads the body
to the 4 instructions declared in loopi.
NOTE(review): presumably the nop is needed because a loop body must not
end with a jump instruction - confirm against the OTBN ISA. The same
loop shape is used for all repeated squarings below. */
loopi 3, 4
bn.mov w10, w16
bn.mov w11, w17
jal x1, barrett384_p384
nop
/* Exp: 0b111111 = 0b111+0b111000
Val: r111111 <= r111*r111000 mod p
[w3,w2] = [w17,w16] <= [w1,w0]*[w17,w16] mod [w13,w12] */
bn.mov w10, w0
bn.mov w11, w1
jal x1, barrett384_p384
bn.mov w2, w16
bn.mov w3, w17
/* Exp: 2^12-1 = (0b111111<<6)+0b111111
Val: r_12_1 <= r111111^(2^6)*r111111 mod p
[w5,w4] = [w17,w16] <= [w17,w16]^(2^6)*[w3,w2] mod [w13,w12] */
loopi 6, 4
bn.mov w10, w16
bn.mov w11, w17
jal x1, barrett384_p384
nop
bn.mov w10, w2
bn.mov w11, w3
jal x1, barrett384_p384
bn.mov w4, w16
bn.mov w5, w17
/* Exp: 2^24-1 = ((2^12-1)<<12)+(2^12-1)
Val: r_24_1 <= r_12_1^(2^12)*r_12_1 mod p
[w17,w16] <= [w17,w16]^(2^12)*[w5,w4] mod [w13,w12] */
loopi 12, 4
bn.mov w10, w16
bn.mov w11, w17
jal x1, barrett384_p384
nop
bn.mov w10, w4
bn.mov w11, w5
jal x1, barrett384_p384
/* Exp: 2^30-1 = ((2^24-1)<<6)+0b111111
Val: r_30_1 <= r_24_1^(2^6)*r111111 mod p
[w3, w2] = [w17,w16] <= [w17,w16]^(2^6)*[w3,w2] mod [w13,w12] */
loopi 6, 4
bn.mov w10, w16
bn.mov w11, w17
jal x1, barrett384_p384
nop
bn.mov w10, w2
bn.mov w11, w3
jal x1, barrett384_p384
bn.mov w2, w16
bn.mov w3, w17
/* Exp: 2^31-1 <= (2^30-1)*2+0b1
Val: r_31_1 <= r_30_1^2*z mod p
[w7,w6] = [w17,w16] <= [w17,w16]^2*[w30,w29] mod [w13,w12] */
bn.mov w10, w16
bn.mov w11, w17
jal x1, barrett384_p384
bn.mov w10, w29
bn.mov w11, w30
jal x1, barrett384_p384
bn.mov w6, w16
bn.mov w7, w17
/* Exp: 2^32-1 <= (2^31-1)*2+0b1
Val: r_32_1 <= r_31_1^2*z mod p
[w8,w9] = [w17,w16] <= [w17,w16]^2*[w30,w29] mod [w13,w12]
Note: unlike the other intermediate values, r_32_1 is kept with its low
limb in w9 and its high limb in w8. It is read back in the same order when
it is used in the final part of the chain. */
bn.mov w10, w16
bn.mov w11, w17
jal x1, barrett384_p384
bn.mov w10, w29
bn.mov w11, w30
jal x1, barrett384_p384
bn.mov w9, w16
bn.mov w8, w17
/* Exp: 2^63-1 <= ((2^32-1)<<31)+(2^31-1)
Val: r_63_1 <= r_32_1^(2^31)*r_31_1 mod p
[w7,w6] = [w17,w16] <= [w17,w16]^(2^31)*[w7,w6] mod [w13,w12] */
loopi 31, 4
bn.mov w10, w16
bn.mov w11, w17
jal x1, barrett384_p384
nop
bn.mov w10, w6
bn.mov w11, w7
jal x1, barrett384_p384
bn.mov w6, w16
bn.mov w7,w17
/* Exp: 2^126-1 = ((2^63-1)<<63) + (2^63-1)
Val: r_126_1 <= r_63_1^(2^63)*r_63_1 mod p
[w7,w6] = [w17,w16] <= [w17,w16]^(2^63)*[w7,w6] mod [w13,w12] */
loopi 63, 4
bn.mov w10, w16
bn.mov w11, w17
jal x1, barrett384_p384
nop
bn.mov w10, w6
bn.mov w11, w7
jal x1, barrett384_p384
bn.mov w6, w16
bn.mov w7, w17
/* Exp: 2^252-1 = ((2^126-1)<<126)+(2^126-1)
Val: r_252_1 <= r_126_1^(2^126)*r_126_1 mod p
[w17,w16] <= [w17,w16]^(2^126)*[w7,w6] mod [w13,w12] */
loopi 126, 4
bn.mov w10, w16
bn.mov w11, w17
jal x1, barrett384_p384
nop
bn.mov w10, w6
bn.mov w11, w7
jal x1, barrett384_p384
/* Exp: 2^255-1 = ((2^252-1)<<3)+0b111
Val: r_255_1 <= r_252_1^(2^3)*r111 mod p
[w17,w16] <= [w17,w16]^(2^3)*[w1,w0] mod [w13,w12] */
loopi 3, 4
bn.mov w10, w16
bn.mov w11, w17
jal x1, barrett384_p384
nop
bn.mov w10, w0
bn.mov w11, w1
jal x1, barrett384_p384
/* Exp: p-2 = ((((((2^255-1)<<33)+(2^32-1))<<94)+(2^30-1))<<2)+0b1
Val: z_inv <=((r_255_1^(2^33)*r_32_1)^(2^94)*r_30_1)^(2^2)*z mod p
[w17,w16] <= (([w17,w16]^(2^33)*[w8,w9])^(2^94)*[w3,w2])^(2^2)
*[w30,w29] mod [w13,w12] */
loopi 33, 4
bn.mov w10, w16
bn.mov w11, w17
jal x1, barrett384_p384
nop
/* multiply by r_32_1 (low limb in w9, high limb in w8) */
bn.mov w10, w9
bn.mov w11, w8
jal x1, barrett384_p384
loopi 94, 4
bn.mov w10, w16
bn.mov w11, w17
jal x1, barrett384_p384
nop
/* multiply by r_30_1 */
bn.mov w10, w2
bn.mov w11, w3
jal x1, barrett384_p384
loopi 2, 4
bn.mov w10, w16
bn.mov w11, w17
jal x1, barrett384_p384
nop
/* multiply by z */
bn.mov w10, w29
bn.mov w11, w30
jal x1, barrett384_p384
/* store inverse [w1,w0] <= [w17,w16] = z_inv*/
bn.mov w0, w16
bn.mov w1, w17
/* convert x-coordinate to affine space
[w26,w25] <= [w17,w16] = x_a <= x/z = x*z_inv = [w26,w25]*[w1,w0] mod p
(z_inv is still present in [w17,w16] from the previous call, so only the
first operand is loaded here) */
bn.mov w10, w25
bn.mov w11, w26
jal x1, barrett384_p384
bn.mov w25, w16
bn.mov w26, w17
/* convert y-coordinate to affine space
[w28,w27] <= [w17,w16] = y_a <= y/z = y*z_inv = [w28,w27]*[w1,w0] mod p
([w17,w16] now holds x_a, so z_inv is reloaded from [w1,w0]) */
bn.mov w10, w27
bn.mov w11, w28
bn.mov w16, w0
bn.mov w17, w1
jal x1, barrett384_p384
bn.mov w27, w16
bn.mov w28, w17
ret
/**
* Fetch curve point from dmem, randomize z-coordinate and store point in dmem
*
* returns P = (x, y, z) = (x_a*z, y_a*z, z)
* with P being a valid P-384 curve point in projective coordinates
* x_a and y_a being the affine coordinates as fetched from dmem
* z being a randomized z-coordinate
*
* This routines fetches the affine x- and y-coordinates of a curve point from
* dmem and computes a valid set of projective coordinates. The z-coordinate is
* randomized and x and y are scaled appropriately. The resulting projective
* coordinates are stored at dmem[dptr_p_p] using 6 consecutive 256-bit cells,
* i.e. each coordinate is stored 512 bit aligned, little endian.
* This routine runs in constant time.
*
* @param[in] x20: dptr_x, pointer to dmem location containing affine
* x-coordinate of input point
* @param[in] x21: dptr_y, pointer to dmem location containing affine
* y-coordinate of input point
* @param[in] [w15, w14]: u[383:0] lower 384 bit of Barrett constant u for
* modulus p
* @param[in] [w13, w12]: p, modulus of P-384 underlying finite field
* @param[in] w31: all-zero
* @param[in] x18: dptr_p_p, pointer to dmem location to store resulting point
* in projective space
*
* Flags: When leaving this subroutine, the M, L and Z flags of FG0 depend on
* the upper limb of projective y-coordinate.
*
* clobbered registers: x10, x11 to x13
* w2, w3, w8 to w11, w16 to w24, w29, w30
* clobbered flag groups: FG0
*/
store_proj_randomize:
/* get a 384-bit random number
[w3, w2] = random(384)
Two 256-bit words are read from WSR 1 (NOTE(review): presumably the RND
register - confirm against the OTBN WSR map). The upper word is then
shifted right by 128 bit with zeros shifted in from w31, leaving 384
random bits in [w3, w2]. */
bn.wsrr w2, 1
bn.wsrr w3, 1
bn.rshi w3, w31, w3 >> 128
/* reduce random number
[w3, w2] = z <= [w3, w2] mod p
A single conditional subtraction of p is used: the value is kept if the
trial subtraction borrows (C set), otherwise value - p is taken. */
bn.sub w10, w2, w12
bn.subb w11, w3, w13
bn.sel w2, w2, w10, C
bn.sel w3, w3, w11, C
/* copy z to [w11, w10]: used as store source below and as first operand of
the following Barrett multiplication */
bn.mov w10, w2
bn.mov w11, w3
/* store z-coordinate
dmem[x18+128] = [w11, w10]
(x10/x11 hold the register indices 10/11 for the indirect stores) */
li x10, 10
li x11, 11
bn.sid x10, 128(x18)
bn.sid x11, 160(x18)
/* fetch x-coordinate from dmem
[w17, w16] = x <= [dmem[dptr_x+32], dmem[dptr_x]]
(x12/x13 hold the register indices 16/17 for the indirect accesses) */
li x12, 16
li x13, 17
bn.lid x12, 0(x20)
bn.lid x13, 32(x20)
/* scale and store x-coordinate
[dmem[dptr_p_p+32], dmem[dptr_p_p]] = [w17, w16] =
x_p <= [w11, w10] * [w17, w16] = z*x mod p */
jal x1, barrett384_p384
bn.sid x12, 0(x18)
bn.sid x13, 32(x18)
/* fetch y-coordinate from dmem
[w17, w16] = y <= [dmem[dptr_y+32], dmem[dptr_y]] */
bn.lid x12, 0(x21)
bn.lid x13, 32(x21)
/* scale and store y-coordinate
[dmem[dptr_p_p+96], dmem[dptr_p_p+64]] = [w17, w16] =
y_p <= [w11, w10] * [w17, w16] = z*y mod p
[w11, w10] is clobbered by the Barrett routine, so z is restored from
[w3, w2] first. */
bn.mov w10, w2
bn.mov w11, w3
jal x1, barrett384_p384
bn.sid x12, 64(x18)
bn.sid x13, 96(x18)
ret
/**
* P-384 scalar point multiplication in affine space
*
* returns R = d*P = d*(x_p, y_p)
* where R, P are valid P-384 curve points in affine coordinates,
* d is a 384-bit scalar.
*
* This routine performs scalar multiplication based on the group laws
* of Weierstrass curves.
* A constant time double-and-add algorithm (sometimes referred to as
* double-and-add-always) is used.
* Due to the P-384 optimized implementations of the internally called routines
* for point addition and doubling, this routine is limited to P-384 curves.
* The routine makes use of blinding by additive splitting the
* exponent/scalar d into two shares. The double-and-add loop operates on both
* shares in parallel applying Shamir's trick.
*
* @param[in] x9: dptr_rnd, pointer to location in dmem containing random
* number to be used for additive splitting of scalar
* @param[in] x19: dptr_d, pointer to scalar d (0 < d < n) in dmem
* @param[in] x20: dptr_x, pointer to affine x-coordinate in dmem
* @param[in] x21: dptr_y, pointer to affine y-coordinate in dmem
* @param[in] x28: dptr_b, pointer to domain parameter b of P-384 in dmem
* @param[in] x30: dptr_sp, pointer to 704 bytes of scratchpad memory in dmem
* @param[in] [w15, w14]: u[383:0] lower 384 bit of Barrett constant u
* corresponding to modulus p
* @param[in] [w13, w12]: p, modulus of P-384 underlying finite field
* @param[in] [w11, w10]: n, domain parameter of P-384 curve
* (order of basepoint G)
* @param[in] w31: all-zero
* @param[out] [w26, w25]: x_a, affine x-coordinate of resulting point R.
* @param[out] [w28, w27]: y_a, affine y-coordinate of resulting point R.
*
* Scratchpad memory layout:
* The routine expects at least 704 bytes of scratchpad memory at dmem
* location dptr_sp. Internally the scratchpad is used as follows:
* dptr_sp .. dptr_sp+192: point P, projective
* dptr_sp+192 .. dptr_sp+255: s0, 1st share of scalar
* dptr_sp+256 .. dptr_sp+447: point 2P, projective
* dptr_sp+448 .. dptr_sp+511: s1, 2nd share of scalar
* dptr_sp+512 .. dptr_sp+703: point Q, projective
*
* Projective coordinates of a point are kept in dmem in little endian format
* with the individual coordinates 512 bit aligned. The coordinates are stored
* in x,y,z order (i.e. x at lowest, z at highest address). Thus, a 384 bit
* curve point occupies 6 consecutive 256-bit dmem cells.
*
* Flags: When leaving this subroutine, the M, L and Z flags of FG0 depend on
* the computed affine y-coordinate.
*
* clobbered registers: x2, x10, x11 to x13, x18, x26, x27, w0 to w30
* clobbered flag groups: FG0
*/
.globl scalar_mult_int_p384
scalar_mult_int_p384:
/* Overview of this routine:
1. Split the scalar d (dmem[x19]) into two shares s0, s1 using the random
number at dmem[x9], reducing with the value held in [w11, w10].
2. Build P and 2P in the scratchpad at x30 and initialise Q = (0, 1, 0).
3. Run a 384-iteration double-and-add loop that probes the MSb of both
shares, always performs the addition, and selects the result with
bn.sel instead of branching.
4. Call proj_to_affine_p384 and return. */
/* Set regfile pointers to in/out regs of Barrett routine (w10, w11, w16,
w17). Set here to avoid resetting them in every call to the point addition
routine. */
li x22, 10
li x23, 11
li x24, 16
li x25, 17
/* Fetch externally supplied random number from dmem.
[w1, w0] <= rnd = [dmem[x9+32], dmem[x9]]
x2 is the wide-register index and is post-incremented by each load, so it
equals 2 after these two loads. */
li x2, 0
bn.lid x2++, 0(x9)
bn.lid x2++, 32(x9)
/* 1st share (reduced rnd)
s0 = [w1, w0] <= rnd mod n = [w1, w0] mod [w11, w10]
Single conditional subtraction: if the subtraction borrows (C set), rnd was
already smaller than [w11, w10] and is kept, otherwise the difference
[w8, w9] is taken. */
bn.sub w9, w0, w10
bn.subb w8, w1, w11
bn.sel w0, w0, w9, C
bn.sel w1, w1, w8, C
/* Load scalar d from dmem (x2 continues at 2 from the loads above).
[w3, w2] = d <= [dmem[x19+32], dmem[x19]] */
bn.lid x2++, 0(x19)
bn.lid x2, 32(x19)
/* 2nd share (d - s0)
s1 = [w3, w2] <= d - s0 mod n = [w3, w2] - [w1, w0] mod [w11, w10]
The modulus is added back unconditionally into [w9, w8]. The carry out of
that addition (C) is set exactly when the preceding subtraction wrapped
around, so C selects the corrected value. */
bn.sub w2, w2, w0
bn.subb w3, w3, w1
bn.add w8, w2, w10
bn.addc w9, w3, w11
bn.sel w2, w8, w2, C
bn.sel w3, w9, w3, C
/* Left align both shares for probing of the MSb in the loop body: each
512-bit pair [w1, w0] and [w3, w2] is shifted left by 128 bits (w31 supplies
the zeros shifted in), so bit 383 of a share ends up at bit 511. */
bn.rshi w1, w1, w0 >> 128
bn.rshi w0, w0, w31 >> 128
bn.rshi w3, w3, w2 >> 128
bn.rshi w2, w2, w31 >> 128
/* Store shares in scratchpad.
dmem[x30+192] <= w0, dmem[x30+224] <= w1 (s0)
dmem[x30+448] <= w2, dmem[x30+480] <= w3 (s1) */
li x2, 0
bn.sid x2++, 192(x30)
bn.sid x2++, 224(x30)
bn.sid x2++, 448(x30)
bn.sid x2++, 480(x30)
/* Get randomized projective coordinates of curve point P.
P = (x_p, y_p, z_p) = dmem[dptr_sp] = (x*z mod p, y*z mod p, z)
x18 points the helper at the start of the scratchpad (x30). */
add x18, x30, 0
jal x1, store_proj_randomize
/* Double point P by adding it to itself: both operand pointers (x26, x27)
refer to P at dmem[x30].
2P = ([w30,w29], [w28,w27], [w26, w25]) <= 2*P */
add x27, x30, x0
add x26, x30, x0
jal x1, proj_add_p384
/* Store point 2P in scratchpad at x30+256.
dmem[dptr_sp+256] = [w30:w25] = 2P */
li x2, 25
bn.sid x2++, 256(x30)
bn.sid x2++, 288(x30)
bn.sid x2++, 320(x30)
bn.sid x2++, 352(x30)
bn.sid x2++, 384(x30)
bn.sid x2++, 416(x30)
/* Init point Q = (0,1,0) for double-and-add in scratchpad.
dmem[x26] = dmem[dptr_sp+512] = Q = (0,1,0)
w30 <= 1 is written to the low word of y (offset 64); the post-increment
then moves x2 to 31 so that the remaining five stores write the all-zero
register w31. */
addi x26, x30, 512
li x2, 30
bn.addi w30, w31, 1
bn.sid x2++, 64(x26)
bn.sid x2, 0(x26)
bn.sid x2, 32(x26)
bn.sid x2, 96(x26)
bn.sid x2, 128(x26)
bn.sid x2, 160(x26)
/* Double-and-add loop with decreasing index: one iteration per scalar bit.
The loop body is the next 85 instructions, ending with the bn.sid that
writes the high word of the rescaled z-coordinate of 2P. Adding or removing
an instruction in the body requires updating this count. */
loopi 384, 85
/* Double point Q: both operand pointers refer to Q in dmem.
Q = ([w30,w29], [w28,w27], [w26, w25]) <= Q + dmem[x27] */
add x27, x26, x0
jal x1, proj_add_p384
/* Store doubled Q in dmem.
dmem[x26] = dmem[dptr_sp+512] <= [w30:w25] */
li x2, 25
bn.sid x2++, 0(x26)
bn.sid x2++, 32(x26)
bn.sid x2++, 64(x26)
bn.sid x2++, 96(x26)
bn.sid x2++, 128(x26)
bn.sid x2++, 160(x26)
/* Probe if MSb of either of the two shares (s0 or s1) but not both is 1.
If only one MSb is set, select P for addition.
If both MSbs are set, select 2P for addition.
(If neither MSb is set, 2P will be selected but the result discarded.)
Only the high words are loaded: w0 <= high word of s0, w1 <= high word of
s1. The xor result in w8 is not used; only the M flag it sets matters. */
li x2, 0
bn.lid x2++, 224(x30)
bn.lid x2, 480(x30)
bn.xor w8, w0, w1
/* Create conditional offset into scratchpad without branching.
if (s0[511] xor s1[511]) x27 <= x30 else x27 <= x30+256
The flag CSR 0x7c0 is read, inverted, and bit 1 (the M flag) is isolated;
shifting the resulting 0 or 2 left by 7 yields an offset of 0 or 256. */
csrrs x3, 0x7c0, x0
xori x3, x3, -1
andi x3, x3, 2
slli x27, x3, 7
add x27, x27, x30
/* Reload randomized projective coordinates for curve point P.
P = (x_p, y_p, z_p) = dmem[dptr_sp] <= (x*z mod p, y*z mod p, z)
NOTE(review): x18 is only set once before the loop and x27 was just
computed; this relies on store_proj_randomize leaving x18, x26 and x27
intact - confirm against that routine. */
jal x1, store_proj_randomize
/* Add points Q+P or Q+2P depending on offset in x27.
Q_a = ([w30,w29], [w28,w27], [w26, w25]) <= Q + dmem[x27] */
jal x1, proj_add_p384
/* Reload the full shares from scratchpad.
[w1, w0] = s0; [w3, w2] = s1 */
li x2, 0
bn.lid x2++, 192(x30)
bn.lid x2++, 224(x30)
bn.lid x2++, 448(x30)
bn.lid x2++, 480(x30)
/* M <= s0[511] | s1[511]
w8 is only a scratch destination here (it is overwritten by the reload of
Q below); the M flag is what the bn.sel instructions consume. */
bn.or w8, w1, w3
/* Load the doubled Q (stored before the addition) from scratchpad.
Q = ([w9,w8], [w7,w6], [w5,w4]) <= dmem[x26] */
li x2, 4
bn.lid x2++, 0(x26)
bn.lid x2++, 32(x26)
bn.lid x2++, 64(x26)
bn.lid x2++, 96(x26)
bn.lid x2++, 128(x26)
bn.lid x2++, 160(x26)
/* Select either the sum Q_a or the doubled Q.
if M (at least one share MSb set): keep Q_a already in [w30:w25]
else: replace it by the doubled Q in [w9:w4] */
bn.sel w25, w25, w4, M
bn.sel w26, w26, w5, M
bn.sel w27, w27, w6, M
bn.sel w28, w28, w7, M
bn.sel w29, w29, w8, M
bn.sel w30, w30, w9, M
/* Store the selected Q in dmem.
dmem[x26] = dmem[dptr_sp+512] <= [w30:w25] */
li x2, 25
bn.sid x2++, 0(x26)
bn.sid x2++, 32(x26)
bn.sid x2++, 64(x26)
bn.sid x2++, 96(x26)
bn.sid x2++, 128(x26)
bn.sid x2++, 160(x26)
/* Left shift both shares by one bit so that the next scalar bit becomes
the MSb (addition of a value to itself with carry into the high word).
s0 <= s0 << 1 ; s1 <= s1 << 1 */
bn.add w0, w0, w0
bn.addc w1, w1, w1
bn.add w2, w2, w2
bn.addc w3, w3, w3
/* Store both shares in scratchpad. This must happen before w2/w3 are
reused for the random number below. */
li x2, 0
bn.sid x2++, 192(x30)
bn.sid x2++, 224(x30)
bn.sid x2++, 448(x30)
bn.sid x2++, 480(x30)
/* Get a fresh random number and scale the coordinates of 2P.
(Scaling each proj. coordinate by the same factor results in the same
point.) */
/* Get a 384-bit random number: two 256-bit reads of WSR 1, with the second
word shifted right by 128 bits so that only 128 bits of it remain. */
bn.wsrr w2, 1
bn.wsrr w3, 1
bn.rshi w3, w31, w3 >> 128
/* Reduce random number by one conditional subtraction of p.
[w3, w2] = z <= [w3, w2] mod p = [w3, w2] mod [w13, w12] */
bn.sub w10, w2, w12
bn.subb w11, w3, w13
bn.sel w2, w2, w10, C
bn.sel w3, w3, w11, C
/* Scale all coordinates of 2P in scratchpad. x2/x3 select w16/w17, the
second operand and result registers of barrett384_p384. w10/w11 (first
operand) are reloaded from [w3, w2] before every call because the Barrett
routine lists them as clobbered. */
li x2, 16
li x3, 17
/* x-coordinate: dmem[x30+256] <= x_2p * z mod p */
bn.mov w10, w2
bn.mov w11, w3
bn.lid x2, 256(x30)
bn.lid x3, 288(x30)
jal x1, barrett384_p384
bn.sid x2, 256(x30)
bn.sid x3, 288(x30)
/* y-coordinate: dmem[x30+320] <= y_2p * z mod p */
bn.mov w10, w2
bn.mov w11, w3
bn.lid x2, 320(x30)
bn.lid x3, 352(x30)
jal x1, barrett384_p384
bn.sid x2, 320(x30)
bn.sid x3, 352(x30)
/* z-coordinate: dmem[x30+384] <= z_2p * z mod p
(last instruction pair of the loop body) */
bn.mov w10, w2
bn.mov w11, w3
bn.lid x2, 384(x30)
bn.lid x3, 416(x30)
jal x1, barrett384_p384
bn.sid x2, 384(x30)
bn.sid x3, 416(x30)
/* Convert coordinates to affine space.
NOTE(review): presumably operates on the final Q still held in [w30:w25]
(the Barrett calls above do not list w25..w30 as clobbered) - confirm
against proj_to_affine_p384. */
jal x1, proj_to_affine_p384
ret